// Copyright 2017 syzkaller project authors. All rights reserved.
// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

// This file is shared between executor and csource package.

// Implementation of syz_kvm_setup_cpu pseudo-syscall.
// See Intel Software Developer’s Manual Volume 3: System Programming Guide
// for details on what happens here.

#include "kvm.S.h"
#include "kvm.h"

#ifndef KVM_SMI
#define KVM_SMI _IO(KVMIO, 0xb7)
#endif

#define CR0_PE 1
#define CR0_MP (1 << 1)
#define CR0_EM (1 << 2)
#define CR0_TS (1 << 3)
#define CR0_ET (1 << 4)
#define CR0_NE (1 << 5)
#define CR0_WP (1 << 16)
#define CR0_AM (1 << 18)
#define CR0_NW (1 << 29)
#define CR0_CD (1 << 30)
#define CR0_PG (1 << 31)

#define CR4_VME 1
#define CR4_PVI (1 << 1)
#define CR4_TSD (1 << 2)
#define CR4_DE (1 << 3)
#define CR4_PSE (1 << 4)
#define CR4_PAE (1 << 5)
#define CR4_MCE (1 << 6)
#define CR4_PGE (1 << 7)
#define CR4_PCE (1 << 8)
#define CR4_OSFXSR (1 << 9) // SDM: OSFXSR is CR4 bit 9; (1 << 8) would collide with CR4_PCE
#define CR4_OSXMMEXCPT (1 << 10)
#define CR4_UMIP (1 << 11)
#define CR4_VMXE (1 << 13)
#define CR4_SMXE (1 << 14)
#define CR4_FSGSBASE (1 << 16)
#define CR4_PCIDE (1 << 17)
#define CR4_OSXSAVE (1 << 18)
#define CR4_SMEP (1 << 20)
#define CR4_SMAP (1 << 21)
#define CR4_PKE (1 << 22)

#define EFER_SCE 1
#define EFER_LME (1 << 8)
#define EFER_LMA (1 << 10)
#define EFER_NXE (1 << 11)
#define EFER_SVME (1 << 12)
#define EFER_LMSLE (1 << 13)
#define EFER_FFXSR (1 << 14)
#define EFER_TCE (1 << 15)

// 32-bit page directory entry bits
#define PDE32_PRESENT 1
#define PDE32_RW (1 << 1)
#define PDE32_USER (1 << 2)
#define PDE32_PS (1 << 7)
// 64-bit page table entry bits (common to PML4E/PDPTE/PDE/PTE)
#define PDE64_PRESENT 1
#define PDE64_RW (1 << 1)
#define PDE64_USER (1 << 2)
#define PDE64_ACCESSED (1 << 5)
#define PDE64_DIRTY (1 << 6)
#define PDE64_PS (1 << 7)
#define PDE64_G (1 << 8)

struct tss16 {
	uint16 prev;
	uint16 sp0;
	uint16 ss0;
	uint16 sp1;
	uint16 ss1;
	uint16 sp2;
	uint16 ss2;
	uint16 ip;
	uint16 flags;
	uint16 ax;
	uint16 cx;
	uint16 dx;
	uint16 bx;
	uint16 sp;
	uint16 bp;
	uint16 si;
	uint16 di;
	uint16 es;
	uint16 cs;
	uint16 ss;
	uint16 ds;
	uint16 ldt;
} __attribute__((packed));

struct tss32 {
	uint16 prev, prevh;
	uint32 sp0;
	uint16 ss0, ss0h;
	uint32 sp1;
	uint16 ss1, ss1h;
	uint32 sp2;
	uint16 ss2, ss2h;
	uint32 cr3;
	uint32 ip;
	uint32 flags;
	uint32 ax;
	uint32 cx;
	uint32 dx;
	uint32 bx;
	uint32 sp;
	uint32 bp;
	uint32 si;
	uint32 di;
	uint16 es, esh;
	uint16 cs, csh;
	uint16 ss, ssh;
	uint16 ds, dsh;
	uint16 fs, fsh;
	uint16 gs, gsh;
	uint16 ldt, ldth;
	uint16 trace;
	uint16 io_bitmap;
} __attribute__((packed));

struct tss64 {
	uint32 reserved0;
	uint64 rsp[3];
	uint64 reserved1;
	uint64 ist[7];
	uint64 reserved2;
	uint32 reserved3;
	uint32 io_bitmap;
} __attribute__((packed));

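// Packs a kvm_segment into the 8-byte legacy descriptor format
// (SDM Vol. 3, "Segment Descriptors"): limit[15:0], base[23:0], type, S,
// DPL, P, limit[19:16], AVL, L, D/B, G, base[31:24], from low to high bits.
// The descriptor is written at the selector's index into both the GDT and
// the LDT, so the same selector works with either table indicator bit.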
static void fill_segment_descriptor(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	uint16 index = seg->selector >> 3;
	uint64 limit = seg->g ? seg->limit >> 12 : seg->limit;
	uint64 sd = (limit & 0xffff) | (seg->base & 0xffffff) << 16 |
		    (uint64)seg->type << 40 | (uint64)seg->s << 44 | (uint64)seg->dpl << 45 |
		    (uint64)seg->present << 47 | (limit & 0xf0000ULL) << 48 |
		    (uint64)seg->avl << 52 | (uint64)seg->l << 53 | (uint64)seg->db << 54 |
		    (uint64)seg->g << 55 | (seg->base & 0xff000000ULL) << 56;
	NONFAILING(dt[index] = sd);
	NONFAILING(lt[index] = sd);
}

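// System descriptors (TSS, call gates) in long mode are 16 bytes; the upper
// qword holds base[63:32], which is zero for all bases used here.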
static void fill_segment_descriptor_dword(uint64* dt, uint64* lt, struct kvm_segment* seg)
{
	fill_segment_descriptor(dt, lt, seg);
	uint16 index = seg->selector >> 3;
	NONFAILING(dt[index + 1] = 0);
	NONFAILING(lt[index + 1] = 0);
}

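// Programs the fast-syscall MSRs: SYSENTER_CS/ESP/EIP for sysenter/sysexit,
// STAR with the kernel and user code selectors used by syscall/sysret, and
// LSTAR with the 64-bit syscall entry point.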
static void setup_syscall_msrs(int cpufd, uint16 sel_cs, uint16 sel_cs_cpl3)
{
	char buf[sizeof(struct kvm_msrs) + 5 * sizeof(struct kvm_msr_entry)];
	memset(buf, 0, sizeof(buf));
	struct kvm_msrs* msrs = (struct kvm_msrs*)buf;
	struct kvm_msr_entry* entries = msrs->entries;
	msrs->nmsrs = 5;
	entries[0].index = MSR_IA32_SYSENTER_CS;
	entries[0].data = sel_cs;
	entries[1].index = MSR_IA32_SYSENTER_ESP;
	entries[1].data = ADDR_STACK0;
	entries[2].index = MSR_IA32_SYSENTER_EIP;
	entries[2].data = ADDR_VAR_SYSEXIT;
	entries[3].index = MSR_IA32_STAR;
	entries[3].data = ((uint64)sel_cs << 32) | ((uint64)sel_cs_cpl3 << 48);
	entries[4].index = MSR_IA32_LSTAR;
	entries[4].data = ADDR_VAR_SYSRET;
	ioctl(cpufd, KVM_SET_MSRS, msrs);
}

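// Builds a 32-vector IDT that cycles through 16/32-bit interrupt, trap and
// task gates so that injected exceptions exercise different delivery paths.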
static void setup_32bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	int i;
	for (i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = i << 3;
		switch (i % 6) {
		case 0:
			// 16-bit interrupt gate
			gate.type = 6;
			gate.base = SEL_CS16;
			break;
		case 1:
			// 16-bit trap gate
			gate.type = 7;
			gate.base = SEL_CS16;
			break;
		case 2:
			// 16-bit task gate
			gate.type = 3;
			gate.base = SEL_TGATE16;
			break;
		case 3:
			// 32-bit interrupt gate
			gate.type = 14;
			gate.base = SEL_CS32;
			break;
		case 4:
			// 32-bit trap gate
			gate.type = 15;
			gate.base = SEL_CS32;
			break;
		case 5:
			// 32-bit task gate
			gate.type = 11;
			gate.base = SEL_TGATE32;
			break;
		}
		gate.limit = guest_mem + ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor(idt, idt, &gate);
	}
}

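// 64-bit IDT entries are 16 bytes each, hence the selector stride of 2 and
// the dword descriptor writer.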
static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t guest_mem)
{
	sregs->idt.base = guest_mem + ADDR_VAR_IDT;
	sregs->idt.limit = 0x1ff;
	uint64* idt = (uint64*)(host_mem + sregs->idt.base);
	int i;
	for (i = 0; i < 32; i++) {
		struct kvm_segment gate;
		gate.selector = (i * 2) << 3;
		gate.type = (i & 1) ? 14 : 15; // interrupt or trap gate
		gate.base = SEL_CS64;
		gate.limit = guest_mem + ADDR_VAR_USER_CODE2; // entry offset
		gate.present = 1;
		gate.dpl = 0;
		gate.s = 0;
		gate.g = 0;
		gate.db = 0;
		gate.l = 0;
		gate.avl = 0;
		fill_segment_descriptor_dword(idt, idt, &gate);
	}
}

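// In-memory layouts of the text and opts arrays passed to syz_kvm_setup_cpu
// (mirroring kvm_text/kvm_setup_opt in the syscall description below).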
struct kvm_text {
	uintptr_t typ;
	const void* text;
	uintptr_t size;
};

struct kvm_opt {
	uint64 typ;
	uint64 val;
};

#define KVM_SETUP_PAGING (1 << 0)
#define KVM_SETUP_PAE (1 << 1)
#define KVM_SETUP_PROTECTED (1 << 2)
#define KVM_SETUP_CPL3 (1 << 3)
#define KVM_SETUP_VIRT86 (1 << 4)
#define KVM_SETUP_SMM (1 << 5)
#define KVM_SETUP_VM (1 << 6)

// syz_kvm_setup_cpu(fd fd_kvmvm, cpufd fd_kvmcpu, usermem vma[24], text ptr[in, array[kvm_text, 1]], ntext len[text], flags flags[kvm_setup_flags], opts ptr[in, array[kvm_setup_opt, 0:2]], nopt len[opts])
static uintptr_t syz_kvm_setup_cpu(uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4, uintptr_t a5, uintptr_t a6, uintptr_t a7)
{
	const int vmfd = a0;
	const int cpufd = a1;
	char* const host_mem = (char*)a2;
	const struct kvm_text* const text_array_ptr = (struct kvm_text*)a3;
	const uintptr_t text_count = a4;
	const uintptr_t flags = a5;
	const struct kvm_opt* const opt_array_ptr = (struct kvm_opt*)a6;
	uintptr_t opt_count = a7;

	const uintptr_t page_size = 4 << 10;
	const uintptr_t ioapic_page = 10;
	const uintptr_t guest_mem_size = 24 * page_size;
	const uintptr_t guest_mem = 0;

	(void)text_count; // fuzzer can spoof count and we need just 1 text, so ignore text_count
	int text_type = 0;
	const void* text = 0;
	uintptr_t text_size = 0;
	NONFAILING(text_type = text_array_ptr[0].typ);
	NONFAILING(text = text_array_ptr[0].text);
	NONFAILING(text_size = text_array_ptr[0].size);

	uintptr_t i;
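	// Map 24 pages of guest RAM, one slot per page, starting at GPA 0;
	// the slot for ioapic_page is redirected to the IOAPIC window at
	// 0xfec00000 so that stray accesses there hit emulated-device memory.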
	for (i = 0; i < guest_mem_size / page_size; i++) {
		struct kvm_userspace_memory_region memreg;
		memreg.slot = i;
		memreg.flags = 0; // can be KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY
		memreg.guest_phys_addr = guest_mem + i * page_size;
		if (i == ioapic_page)
			memreg.guest_phys_addr = 0xfec00000;
		memreg.memory_size = page_size;
		memreg.userspace_addr = (uintptr_t)host_mem + i * page_size;
		ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);
	}
	// SMRAM: bits [31:16] of the slot id select the KVM address space,
	// so this region lives in address space 1 (SMM) and is visible only
	// while the vCPU is in System Management Mode.
	struct kvm_userspace_memory_region memreg;
	memreg.slot = 1 + (1 << 16);
	memreg.flags = 0;
	memreg.guest_phys_addr = 0x30000;
	memreg.memory_size = 64 << 10;
	memreg.userspace_addr = (uintptr_t)host_mem;
	ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg);

	struct kvm_sregs sregs;
	if (ioctl(cpufd, KVM_GET_SREGS, &sregs))
		return -1;

	struct kvm_regs regs;
	memset(&regs, 0, sizeof(regs));
	regs.rip = guest_mem + ADDR_TEXT;
	regs.rsp = ADDR_STACK0;

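	// Build a full set of code/data/TSS/gate segments for every mode and
	// privilege level; descriptors are committed to the GDT/LDT at the end.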
	sregs.gdt.base = guest_mem + ADDR_GDT;
	sregs.gdt.limit = 256 * sizeof(uint64) - 1;
	uint64* gdt = (uint64*)(host_mem + sregs.gdt.base);

	struct kvm_segment seg_ldt;
	seg_ldt.selector = SEL_LDT;
	seg_ldt.type = 2;
	seg_ldt.base = guest_mem + ADDR_LDT;
	seg_ldt.limit = 256 * sizeof(uint64) - 1;
	seg_ldt.present = 1;
	seg_ldt.dpl = 0;
	seg_ldt.s = 0;
	seg_ldt.g = 0;
	seg_ldt.db = 1;
	seg_ldt.l = 0;
	sregs.ldt = seg_ldt;
	uint64* ldt = (uint64*)(host_mem + sregs.ldt.base);

	struct kvm_segment seg_cs16;
	seg_cs16.selector = SEL_CS16;
	seg_cs16.type = 11;
	seg_cs16.base = 0;
	seg_cs16.limit = 0xfffff;
	seg_cs16.present = 1;
	seg_cs16.dpl = 0;
	seg_cs16.s = 1;
	seg_cs16.g = 0;
	seg_cs16.db = 0;
	seg_cs16.l = 0;

	struct kvm_segment seg_ds16 = seg_cs16;
	seg_ds16.selector = SEL_DS16;
	seg_ds16.type = 3;

	struct kvm_segment seg_cs16_cpl3 = seg_cs16;
	seg_cs16_cpl3.selector = SEL_CS16_CPL3;
	seg_cs16_cpl3.dpl = 3;

	struct kvm_segment seg_ds16_cpl3 = seg_ds16;
	seg_ds16_cpl3.selector = SEL_DS16_CPL3;
	seg_ds16_cpl3.dpl = 3;

	struct kvm_segment seg_cs32 = seg_cs16;
	seg_cs32.selector = SEL_CS32;
	seg_cs32.db = 1;

	struct kvm_segment seg_ds32 = seg_ds16;
	seg_ds32.selector = SEL_DS32;
	seg_ds32.db = 1;

	struct kvm_segment seg_cs32_cpl3 = seg_cs32;
	seg_cs32_cpl3.selector = SEL_CS32_CPL3;
	seg_cs32_cpl3.dpl = 3;

	struct kvm_segment seg_ds32_cpl3 = seg_ds32;
	seg_ds32_cpl3.selector = SEL_DS32_CPL3;
	seg_ds32_cpl3.dpl = 3;

	struct kvm_segment seg_cs64 = seg_cs16;
	seg_cs64.selector = SEL_CS64;
	seg_cs64.l = 1;

	struct kvm_segment seg_ds64 = seg_ds32;
	seg_ds64.selector = SEL_DS64;

	struct kvm_segment seg_cs64_cpl3 = seg_cs64;
	seg_cs64_cpl3.selector = SEL_CS64_CPL3;
	seg_cs64_cpl3.dpl = 3;

	struct kvm_segment seg_ds64_cpl3 = seg_ds64;
	seg_ds64_cpl3.selector = SEL_DS64_CPL3;
	seg_ds64_cpl3.dpl = 3;

	struct kvm_segment seg_tss32;
	seg_tss32.selector = SEL_TSS32;
	seg_tss32.type = 9;
	seg_tss32.base = ADDR_VAR_TSS32;
	seg_tss32.limit = 0x1ff;
	seg_tss32.present = 1;
	seg_tss32.dpl = 0;
	seg_tss32.s = 0;
	seg_tss32.g = 0;
	seg_tss32.db = 0;
	seg_tss32.l = 0;

	struct kvm_segment seg_tss32_2 = seg_tss32;
	seg_tss32_2.selector = SEL_TSS32_2;
	seg_tss32_2.base = ADDR_VAR_TSS32_2;

	struct kvm_segment seg_tss32_cpl3 = seg_tss32;
	seg_tss32_cpl3.selector = SEL_TSS32_CPL3;
	seg_tss32_cpl3.base = ADDR_VAR_TSS32_CPL3;

	struct kvm_segment seg_tss32_vm86 = seg_tss32;
	seg_tss32_vm86.selector = SEL_TSS32_VM86;
	seg_tss32_vm86.base = ADDR_VAR_TSS32_VM86;

	struct kvm_segment seg_tss16 = seg_tss32;
	seg_tss16.selector = SEL_TSS16;
	seg_tss16.base = ADDR_VAR_TSS16;
	seg_tss16.limit = 0xff;
	seg_tss16.type = 1;

	struct kvm_segment seg_tss16_2 = seg_tss16;
	seg_tss16_2.selector = SEL_TSS16_2;
	seg_tss16_2.base = ADDR_VAR_TSS16_2;
	seg_tss16_2.dpl = 0;

	struct kvm_segment seg_tss16_cpl3 = seg_tss16;
	seg_tss16_cpl3.selector = SEL_TSS16_CPL3;
	seg_tss16_cpl3.base = ADDR_VAR_TSS16_CPL3;
	seg_tss16_cpl3.dpl = 3;

	struct kvm_segment seg_tss64 = seg_tss32;
	seg_tss64.selector = SEL_TSS64;
	seg_tss64.base = ADDR_VAR_TSS64;
	seg_tss64.limit = 0x1ff;

	struct kvm_segment seg_tss64_cpl3 = seg_tss64;
	seg_tss64_cpl3.selector = SEL_TSS64_CPL3;
	seg_tss64_cpl3.base = ADDR_VAR_TSS64_CPL3;
	seg_tss64_cpl3.dpl = 3;

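	// Call gates and task gates reuse the segment-descriptor packer: for a
	// gate, "base" carries the target selector (plus the parameter count
	// for call gates) and "limit" the entry offset.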
	struct kvm_segment seg_cgate16;
	seg_cgate16.selector = SEL_CGATE16;
	seg_cgate16.type = 4;
	seg_cgate16.base = SEL_CS16 | (2 << 16); // selector + param count
	seg_cgate16.limit = ADDR_VAR_USER_CODE2; // entry offset
	seg_cgate16.present = 1;
	seg_cgate16.dpl = 0;
	seg_cgate16.s = 0;
	seg_cgate16.g = 0;
	seg_cgate16.db = 0;
	seg_cgate16.l = 0;
	seg_cgate16.avl = 0;

	struct kvm_segment seg_tgate16 = seg_cgate16;
	seg_tgate16.selector = SEL_TGATE16;
	seg_tgate16.type = 3;
	seg_tgate16.base = SEL_TSS16_2; // target TSS selector (was mistakenly assigned to seg_cgate16)
	seg_tgate16.limit = 0;

	struct kvm_segment seg_cgate32 = seg_cgate16;
	seg_cgate32.selector = SEL_CGATE32;
	seg_cgate32.type = 12;
	seg_cgate32.base = SEL_CS32 | (2 << 16); // selector + param count

	struct kvm_segment seg_tgate32 = seg_cgate32;
	seg_tgate32.selector = SEL_TGATE32;
	seg_tgate32.type = 11;
	seg_tgate32.base = SEL_TSS32_2;
	seg_tgate32.limit = 0;

	struct kvm_segment seg_cgate64 = seg_cgate16;
	seg_cgate64.selector = SEL_CGATE64;
	seg_cgate64.type = 12;
	seg_cgate64.base = SEL_CS64;

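	// Expose the host's supported CPUID to the guest so that cpuid-guarded
	// instructions in the fuzzed text behave as on bare metal.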
	int kvmfd = open("/dev/kvm", O_RDWR);
	char buf[sizeof(struct kvm_cpuid2) + 128 * sizeof(struct kvm_cpuid_entry2)];
	memset(buf, 0, sizeof(buf));
	struct kvm_cpuid2* cpuid = (struct kvm_cpuid2*)buf;
	cpuid->nent = 128;
	ioctl(kvmfd, KVM_GET_SUPPORTED_CPUID, cpuid);
	ioctl(cpufd, KVM_SET_CPUID2, cpuid);
	close(kvmfd);

	const char* text_prefix = 0;
	int text_prefix_size = 0;
	char* host_text = host_mem + ADDR_TEXT;

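	// text_type selects the execution mode of the payload: 8 = real mode
	// (8086), 16/32 = protected mode, anything else = 64-bit long mode.
	// Mode-specific setup code from kvm.S.h is prepended where needed.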
	if (text_type == 8) {
		if (flags & KVM_SETUP_SMM) {
			if (flags & KVM_SETUP_PROTECTED) {
				sregs.cs = seg_cs16;
				sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
				sregs.cr0 |= CR0_PE;
			} else {
				sregs.cs.selector = 0;
				sregs.cs.base = 0;
			}

			NONFAILING(*(host_mem + ADDR_TEXT) = 0xf4); // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_VIRT86) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
			sregs.cr0 |= CR0_PE;
			sregs.efer |= EFER_SCE;

			setup_syscall_msrs(cpufd, SEL_CS32, SEL_CS32_CPL3);
			setup_32bit_idt(&sregs, host_mem, guest_mem);

			if (flags & KVM_SETUP_PAGING) {
				uint64 pd_addr = guest_mem + ADDR_PD;
				uint64* pd = (uint64*)(host_mem + ADDR_PD);
				// A single 4MB page to cover the memory region
				NONFAILING(pd[0] = PDE32_PRESENT | PDE32_RW | PDE32_USER | PDE32_PS);
				sregs.cr3 = pd_addr;
				sregs.cr4 |= CR4_PSE;

				text_prefix = kvm_asm32_paged_vm86;
				text_prefix_size = sizeof(kvm_asm32_paged_vm86) - 1;
			} else {
				text_prefix = kvm_asm32_vm86;
				text_prefix_size = sizeof(kvm_asm32_vm86) - 1;
			}
		} else {
			sregs.cs.selector = 0;
			sregs.cs.base = 0;
		}
	} else if (text_type == 16) {
		if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;

			text_prefix = kvm_asm16_cpl3;
			text_prefix_size = sizeof(kvm_asm16_cpl3) - 1;
		} else {
			sregs.cr0 |= CR0_PE;
			sregs.cs = seg_cs16;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds16;
		}
	} else if (text_type == 32) {
		sregs.cr0 |= CR0_PE;
		sregs.efer |= EFER_SCE;

		setup_syscall_msrs(cpufd, SEL_CS32, SEL_CS32_CPL3);
		setup_32bit_idt(&sregs, host_mem, guest_mem);

		if (flags & KVM_SETUP_SMM) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			NONFAILING(*(host_mem + ADDR_TEXT) = 0xf4); // hlt for rsm
			host_text = host_mem + 0x8000;

			ioctl(cpufd, KVM_SMI, 0);
		} else if (flags & KVM_SETUP_PAGING) {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

			uint64 pd_addr = guest_mem + ADDR_PD;
			uint64* pd = (uint64*)(host_mem + ADDR_PD);
			// A single 4MB page to cover the memory region
			NONFAILING(pd[0] = PDE32_PRESENT | PDE32_RW | PDE32_USER | PDE32_PS);
			sregs.cr3 = pd_addr;
			sregs.cr4 |= CR4_PSE;

			text_prefix = kvm_asm32_paged;
			text_prefix_size = sizeof(kvm_asm32_paged) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			sregs.cs = seg_cs32_cpl3;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32_cpl3;
		} else {
			sregs.cs = seg_cs32;
			sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;
		}
	} else {
		sregs.efer |= EFER_LME | EFER_SCE;
		sregs.cr0 |= CR0_PE;

		setup_syscall_msrs(cpufd, SEL_CS64, SEL_CS64_CPL3);
		setup_64bit_idt(&sregs, host_mem, guest_mem);

		sregs.cs = seg_cs32;
		sregs.ds = sregs.es = sregs.fs = sregs.gs = sregs.ss = seg_ds32;

		uint64 pml4_addr = guest_mem + ADDR_PML4;
		uint64* pml4 = (uint64*)(host_mem + ADDR_PML4);
		uint64 pdpt_addr = guest_mem + ADDR_PDP;
		uint64* pdpt = (uint64*)(host_mem + ADDR_PDP);
		uint64 pd_addr = guest_mem + ADDR_PD;
		uint64* pd = (uint64*)(host_mem + ADDR_PD);
		NONFAILING(pml4[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | pdpt_addr);
		NONFAILING(pdpt[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | pd_addr);
		NONFAILING(pd[0] = PDE64_PRESENT | PDE64_RW | PDE64_USER | PDE64_PS);
		sregs.cr3 = pml4_addr;
		sregs.cr4 |= CR4_PAE;

		if (flags & KVM_SETUP_VM) {
			sregs.cr0 |= CR0_NE;

			NONFAILING(*((uint64*)(host_mem + ADDR_VAR_VMXON_PTR)) = ADDR_VAR_VMXON);
			NONFAILING(*((uint64*)(host_mem + ADDR_VAR_VMCS_PTR)) = ADDR_VAR_VMCS);
			NONFAILING(memcpy(host_mem + ADDR_VAR_VMEXIT_CODE, kvm_asm64_vm_exit, sizeof(kvm_asm64_vm_exit) - 1));
			NONFAILING(*((uint64*)(host_mem + ADDR_VAR_VMEXIT_PTR)) = ADDR_VAR_VMEXIT_CODE);

			text_prefix = kvm_asm64_init_vm;
			text_prefix_size = sizeof(kvm_asm64_init_vm) - 1;
		} else if (flags & KVM_SETUP_CPL3) {
			text_prefix = kvm_asm64_cpl3;
			text_prefix_size = sizeof(kvm_asm64_cpl3) - 1;
		} else {
			text_prefix = kvm_asm64_enable_long;
			text_prefix_size = sizeof(kvm_asm64_enable_long) - 1;
		}
	}

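	// Write TSS images into guest memory; each backs one of the task gates
	// or TSS descriptors set up above. FLAGS bit 1 is architecturally
	// always set.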
	struct tss16 tss16;
	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = ADDR_STACK0;
	tss16.ip = ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = SEL_CS16;
	tss16.es = tss16.ds = tss16.ss = SEL_DS16;
	tss16.ldt = SEL_LDT;
	struct tss16* tss16_addr = (struct tss16*)(host_mem + seg_tss16_2.base);
	NONFAILING(memcpy(tss16_addr, &tss16, sizeof(tss16)));

	memset(&tss16, 0, sizeof(tss16));
	tss16.ss0 = tss16.ss1 = tss16.ss2 = SEL_DS16;
	tss16.sp0 = tss16.sp1 = tss16.sp2 = ADDR_STACK0;
	tss16.ip = ADDR_VAR_USER_CODE2;
	tss16.flags = (1 << 1);
	tss16.cs = SEL_CS16_CPL3;
	tss16.es = tss16.ds = tss16.ss = SEL_DS16_CPL3;
	tss16.ldt = SEL_LDT;
	struct tss16* tss16_cpl3_addr = (struct tss16*)(host_mem + seg_tss16_cpl3.base);
	NONFAILING(memcpy(tss16_cpl3_addr, &tss16, sizeof(tss16)));

	struct tss32 tss32;
	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = ADDR_STACK0;
	tss32.ip = ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1) | (1 << 17); // EFLAGS.VM: this TSS enters virtual-8086 mode
	tss32.ldt = SEL_LDT;
	tss32.cr3 = sregs.cr3;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_addr = (struct tss32*)(host_mem + seg_tss32_vm86.base);
	NONFAILING(memcpy(tss32_addr, &tss32, sizeof(tss32)));

	memset(&tss32, 0, sizeof(tss32));
	tss32.ss0 = tss32.ss1 = tss32.ss2 = SEL_DS32;
	tss32.sp0 = tss32.sp1 = tss32.sp2 = ADDR_STACK0;
	tss32.ip = ADDR_VAR_USER_CODE;
	tss32.flags = (1 << 1);
	tss32.cr3 = sregs.cr3;
	tss32.es = tss32.ds = tss32.ss = tss32.gs = tss32.fs = SEL_DS32;
	tss32.cs = SEL_CS32;
	tss32.ldt = SEL_LDT;
	tss32.io_bitmap = offsetof(struct tss32, io_bitmap);
	struct tss32* tss32_cpl3_addr = (struct tss32*)(host_mem + seg_tss32_2.base);
	NONFAILING(memcpy(tss32_cpl3_addr, &tss32, sizeof(tss32)));

	struct tss64 tss64;
	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = ADDR_STACK0;
	tss64.rsp[1] = ADDR_STACK0;
	tss64.rsp[2] = ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_addr = (struct tss64*)(host_mem + seg_tss64.base);
	NONFAILING(memcpy(tss64_addr, &tss64, sizeof(tss64)));

	memset(&tss64, 0, sizeof(tss64));
	tss64.rsp[0] = ADDR_STACK0;
	tss64.rsp[1] = ADDR_STACK0;
	tss64.rsp[2] = ADDR_STACK0;
	tss64.io_bitmap = offsetof(struct tss64, io_bitmap);
	struct tss64* tss64_cpl3_addr = (struct tss64*)(host_mem + seg_tss64_cpl3.base);
	NONFAILING(memcpy(tss64_cpl3_addr, &tss64, sizeof(tss64)));

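	// Copy the (clamped) fuzzer-provided text after the mode-setup prefix
	// and patch the prefix's placeholder constants: the 0xbadc0de marker in
	// an LJMP becomes the flat address of the following instruction, and a
	// PREFIX_SIZE marker becomes the offset of the user text.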
	if (text_size > 1000)
		text_size = 1000;
	if (text_prefix) {
		NONFAILING(memcpy(host_text, text_prefix, text_prefix_size));
		void* patch = 0;
		// Replace 0xbadc0de in LJMP with offset of a next instruction.
		NONFAILING(patch = memmem(host_text, text_prefix_size, "\xde\xc0\xad\x0b", 4));
		if (patch)
			NONFAILING(*((uint32*)patch) = guest_mem + ADDR_TEXT + ((char*)patch - host_text) + 6);
		uint16 magic = PREFIX_SIZE;
		patch = 0;
		NONFAILING(patch = memmem(host_text, text_prefix_size, &magic, sizeof(magic)));
		if (patch)
			NONFAILING(*((uint16*)patch) = guest_mem + ADDR_TEXT + text_prefix_size);
	}
	NONFAILING(memcpy((void*)(host_text + text_prefix_size), text, text_size));
	NONFAILING(*(host_text + text_prefix_size + text_size) = 0xf4); // hlt

	NONFAILING(memcpy(host_mem + ADDR_VAR_USER_CODE, text, text_size));
	NONFAILING(*(host_mem + ADDR_VAR_USER_CODE + text_size) = 0xf4); // hlt

	NONFAILING(*(host_mem + ADDR_VAR_HLT) = 0xf4); // hlt
	NONFAILING(memcpy(host_mem + ADDR_VAR_SYSRET, "\x0f\x07\xf4", 3));
	NONFAILING(memcpy(host_mem + ADDR_VAR_SYSEXIT, "\x0f\x35\xf4", 3));

	NONFAILING(*(uint64*)(host_mem + ADDR_VAR_VMWRITE_FLD) = 0);
	NONFAILING(*(uint64*)(host_mem + ADDR_VAR_VMWRITE_VAL) = 0);

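	// Each opt perturbs one aspect of the setup: CR0/CR4/EFER bits, RFLAGS
	// (mirrored into the TSS images), segment descriptor types, or the
	// VMWRITE field/value used by the nested-VM prefix.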
	if (opt_count > 2)
		opt_count = 2;
	for (i = 0; i < opt_count; i++) {
		uint64 typ = 0;
		uint64 val = 0;
		NONFAILING(typ = opt_array_ptr[i].typ);
		NONFAILING(val = opt_array_ptr[i].val);
		switch (typ % 9) {
		case 0:
			sregs.cr0 ^= val & (CR0_MP | CR0_EM | CR0_ET | CR0_NE | CR0_WP | CR0_AM | CR0_NW | CR0_CD);
			break;
		case 1:
			sregs.cr4 ^= val & (CR4_VME | CR4_PVI | CR4_TSD | CR4_DE | CR4_MCE | CR4_PGE | CR4_PCE |
					    CR4_OSFXSR | CR4_OSXMMEXCPT | CR4_UMIP | CR4_VMXE | CR4_SMXE | CR4_FSGSBASE | CR4_PCIDE |
					    CR4_OSXSAVE | CR4_SMEP | CR4_SMAP | CR4_PKE);
			break;
		case 2:
			sregs.efer ^= val & (EFER_SCE | EFER_NXE | EFER_SVME | EFER_LMSLE | EFER_FFXSR | EFER_TCE);
			break;
		case 3:
			val &= ((1 << 8) | (1 << 9) | (1 << 10) | (1 << 12) | (1 << 13) | (1 << 14) |
				(1 << 15) | (1 << 18) | (1 << 19) | (1 << 20) | (1 << 21));
			regs.rflags ^= val;
			NONFAILING(tss16_addr->flags ^= val);
			NONFAILING(tss16_cpl3_addr->flags ^= val);
			NONFAILING(tss32_addr->flags ^= val);
			NONFAILING(tss32_cpl3_addr->flags ^= val);
			break;
		case 4:
			seg_cs16.type = val & 0xf;
			seg_cs32.type = val & 0xf;
			seg_cs64.type = val & 0xf;
			break;
		case 5:
			seg_cs16_cpl3.type = val & 0xf;
			seg_cs32_cpl3.type = val & 0xf;
			seg_cs64_cpl3.type = val & 0xf;
			break;
		case 6:
			seg_ds16.type = val & 0xf;
			seg_ds32.type = val & 0xf;
			seg_ds64.type = val & 0xf;
			break;
		case 7:
			seg_ds16_cpl3.type = val & 0xf;
			seg_ds32_cpl3.type = val & 0xf;
			seg_ds64_cpl3.type = val & 0xf;
			break;
		case 8:
			NONFAILING(*(uint64*)(host_mem + ADDR_VAR_VMWRITE_FLD) = (val & 0xffff));
			NONFAILING(*(uint64*)(host_mem + ADDR_VAR_VMWRITE_VAL) = (val >> 16));
			break;
		default:
			fail("bad kvm setup opt");
		}
	}
	regs.rflags |= 2; // bit 1 is always set

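	// Commit all segment, TSS and gate descriptors into the guest's GDT
	// and LDT (16-byte long-mode descriptors go through the dword writer).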
	fill_segment_descriptor(gdt, ldt, &seg_ldt);
	fill_segment_descriptor(gdt, ldt, &seg_cs16);
	fill_segment_descriptor(gdt, ldt, &seg_ds16);
	fill_segment_descriptor(gdt, ldt, &seg_cs16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds16_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs32);
	fill_segment_descriptor(gdt, ldt, &seg_ds32);
	fill_segment_descriptor(gdt, ldt, &seg_cs32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cs64);
	fill_segment_descriptor(gdt, ldt, &seg_ds64);
	fill_segment_descriptor(gdt, ldt, &seg_cs64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_ds64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_tss32_vm86);
	fill_segment_descriptor(gdt, ldt, &seg_tss16);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_2);
	fill_segment_descriptor(gdt, ldt, &seg_tss16_cpl3);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64);
	fill_segment_descriptor_dword(gdt, ldt, &seg_tss64_cpl3);
	fill_segment_descriptor(gdt, ldt, &seg_cgate16);
	fill_segment_descriptor(gdt, ldt, &seg_tgate16);
	fill_segment_descriptor(gdt, ldt, &seg_cgate32);
	fill_segment_descriptor(gdt, ldt, &seg_tgate32);
	fill_segment_descriptor_dword(gdt, ldt, &seg_cgate64);

	if (ioctl(cpufd, KVM_SET_SREGS, &sregs))
		return -1;
	if (ioctl(cpufd, KVM_SET_REGS, &regs))
		return -1;
	return 0;
}
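/* Rough usage sketch (not part of the executor; vmfd/cpufd/usermem/code are
 * illustrative names): assuming vmfd and cpufd came from KVM_CREATE_VM and
 * KVM_CREATE_VCPU, and usermem is a 24-page writable mapping:
 *
 *	struct kvm_text text = {64, code, code_size}; // run code in long mode
 *	syz_kvm_setup_cpu(vmfd, cpufd, (uintptr_t)usermem, (uintptr_t)&text,
 *			  1, 0, 0, 0);
 *	ioctl(cpufd, KVM_RUN, 0);
 */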