/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227
#define NR_SYSFS_BANKS 6

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
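/*
 * Per-bank MCi_CTL value programmed in mce_init(); ~0UL enables reporting
 * of every error type.  The first NR_SYSFS_BANKS entries can be changed at
 * runtime through the bankNctl sysfs files below.
 */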
static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

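/*
 * mce_log() may be called from machine check/NMI context, so it cannot take
 * locks: a slot in mcelog.entry[] is reserved by advancing mcelog.next with
 * cmpxchg(), the record is copied in, and only then is ->finished set so
 * that readers never see a half-written entry.
 */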
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

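/*
 * Dump every entry logged since 'start' (the TSC read at exception entry),
 * plus the record that triggered the panic if it never made it into the
 * log, then panic.
 */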
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */
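/*
 * Calling convention used elsewhere in this file: regs == NULL for the
 * polling and boot-time paths (never panic there), error_code < 0 to skip
 * recording a TSC for leftover boot errors, and error_code == -2
 * (mce=nobootlog, or the AMD default) to clear the banks without logging.
 */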
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;

	atomic_inc(&mce_entry);

	if ((regs
	     && notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
	    || !banks)
		goto out2;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

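	/*
	 * Scan every bank.  Each bank owns four consecutive MSRs starting
	 * at MSR_IA32_MC0_CTL (CTL, STATUS, ADDR, MISC), hence the i*4
	 * stride below.
	 */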
	for (i = 0; i < banks; i++) {
		if (i < NR_SYSFS_BANKS && !bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

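	/*
	 * TIF_MCE_NOTIFY is checked on return to user space (entry.S) and
	 * from the idle notifier below; both paths call mce_notify_user(),
	 * which wakes /dev/mcelog readers and runs the user-space trigger.
	 */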
	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

 out:
	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce m;

	memset(&m, 0, sizeof(m));
	m.cpu = cpu;
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	rdtscll(m.tsc);
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(struct work_struct *work)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	schedule_delayed_work(&mcheck_work, next_interval);
}

/*
 * This is only called from process context.  This is where we do
 * anything we need to alert userspace about new MCEs.  This is called
 * directly from the poller and also from entry.S and idle, thanks to
 * TIF_MCE_NOTIFY.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);
		if (trigger[0])
			call_usermodehelper(trigger, trigger_argv, NULL,
						UMH_NO_WAIT);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	next_interval = check_interval * HZ;
	if (next_interval)
		schedule_delayed_work(&mcheck_work,
				      round_jiffies_relative(next_interval));
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);


/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > MCE_EXTENDED_BANK) {
		banks = MCE_EXTENDED_BANK;
		printk(KERN_INFO "MCE: warning: using only %d banks\n",
		       MCE_EXTENDED_BANK);
	}
	/* Use accurate RIP reporting if available: MCG_CAP bit 9 (MCG_EXT_P)
	   advertises extended state MSRs, and MCG_EIP is only implemented
	   when at least 9 of them (MCG_CAP bits 23:16) are present. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (i < NR_SYSFS_BANKS)
			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		else
			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);

		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, &bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}

}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;

		while (!mcelog.entry[i].finished) {
			if (time_after_eq(jiffies, start + 2)) {
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

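	/*
	 * A logger racing with us may already have reserved a slot under
	 * the old mcelog.next.  Wait a grace period for those writers to
	 * finish, then copy out any finished stragglers whose TSC predates
	 * the per-CPU timestamps collected below.
	 */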
	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

static unsigned long old_cr4 __initdata;

void __init stop_mce(void)
{
	old_cr4 = read_cr4();
	clear_in_cr4(X86_CR4_MCE);
}

void __init restart_mce(void)
{
	if (old_cr4 & X86_CR4_MCE)
		set_in_cr4(X86_CR4_MCE);
}

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check. Note you can re-enable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (next_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1);
	next_interval = check_interval * HZ;
	if (next_interval)
		schedule_delayed_work(&mcheck_work,
				      round_jiffies_relative(next_interval));
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
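/*
 * ACCESSOR() generates a matched show_<name>/set_<name> pair plus the
 * SYSDEV_ATTR for one control variable; a write parses the new value with
 * simple_strtoul() and then runs 'start' (mce_restart() below) to
 * reprogram the hardware.
 */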
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/*
 * TBD should generate these dynamically based on number of available banks.
 * Have only 6 control banks in sysfs until then.
 */
ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(bank5ctl, bank[5], mce_restart())

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
				const char *buf, size_t siz)
{
	char *p;
	int len;
	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	/* strchr() returns NULL when there is no newline to strip */
	if (p)
		*p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};

static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error:
	while (i--) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
			mce_attributes[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
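/*
 * Note the ordering: on CPU_ONLINE the per-CPU sysdev is registered before
 * threshold_cpu_callback() (set by the AMD threshold bank code) adds its
 * files, and on CPU_DEAD those files are removed before the device itself
 * goes away.
 */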
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);