1 #define pr_fmt(fmt) "SVM: " fmt
2
3 #include <linux/kvm_host.h>
4
5 #include "irq.h"
6 #include "mmu.h"
7 #include "kvm_cache_regs.h"
8 #include "x86.h"
9 #include "cpuid.h"
10 #include "pmu.h"
11
12 #include <linux/module.h>
13 #include <linux/mod_devicetable.h>
14 #include <linux/kernel.h>
15 #include <linux/vmalloc.h>
16 #include <linux/highmem.h>
17 #include <linux/amd-iommu.h>
18 #include <linux/sched.h>
19 #include <linux/trace_events.h>
20 #include <linux/slab.h>
21 #include <linux/hashtable.h>
22 #include <linux/objtool.h>
23 #include <linux/psp-sev.h>
24 #include <linux/file.h>
25 #include <linux/pagemap.h>
26 #include <linux/swap.h>
27 #include <linux/rwsem.h>
28
29 #include <asm/apic.h>
30 #include <asm/perf_event.h>
31 #include <asm/tlbflush.h>
32 #include <asm/desc.h>
33 #include <asm/debugreg.h>
34 #include <asm/kvm_para.h>
35 #include <asm/irq_remapping.h>
36 #include <asm/mce.h>
37 #include <asm/spec-ctrl.h>
38 #include <asm/cpu_device_id.h>
39
40 #include <asm/virtext.h>
41 #include "trace.h"
42
43 #include "svm.h"
44
45 #define __ex(x) __kvm_handle_fault_on_reboot(x)
46
47 MODULE_AUTHOR("Qumranet");
48 MODULE_LICENSE("GPL");
49
50 #ifdef MODULE
51 static const struct x86_cpu_id svm_cpu_id[] = {
52 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
53 {}
54 };
55 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
56 #endif
57
58 #define IOPM_ALLOC_ORDER 2
59 #define MSRPM_ALLOC_ORDER 1
60
61 #define SEG_TYPE_LDT 2
62 #define SEG_TYPE_BUSY_TSS16 3
63
64 #define SVM_FEATURE_LBRV (1 << 1)
65 #define SVM_FEATURE_SVML (1 << 2)
66 #define SVM_FEATURE_TSC_RATE (1 << 4)
67 #define SVM_FEATURE_VMCB_CLEAN (1 << 5)
68 #define SVM_FEATURE_FLUSH_ASID (1 << 6)
69 #define SVM_FEATURE_DECODE_ASSIST (1 << 7)
70 #define SVM_FEATURE_PAUSE_FILTER (1 << 10)
71
72 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
73
74 #define TSC_RATIO_RSVD 0xffffff0000000000ULL
75 #define TSC_RATIO_MIN 0x0000000000000001ULL
76 #define TSC_RATIO_MAX 0x000000ffffffffffULL
77
78 static bool erratum_383_found __read_mostly;
79
80 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
81
82 /*
83 * Set osvw_len to a higher value when updated Revision Guides
84 * are published and we know what the new status bits are
85 */
86 static uint64_t osvw_len = 4, osvw_status;
87
88 static DEFINE_PER_CPU(u64, current_tsc_ratio);
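/* 1.0 in the TSC ratio MSR's fixed-point format: integer part in bits 39:32, fraction in bits 31:0. */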
89 #define TSC_RATIO_DEFAULT 0x0100000000ULL
90
91 static const struct svm_direct_access_msrs {
92 u32 index; /* Index of the MSR */
93 bool always; /* True if intercept is always on */
94 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
95 { .index = MSR_STAR, .always = true },
96 { .index = MSR_IA32_SYSENTER_CS, .always = true },
97 #ifdef CONFIG_X86_64
98 { .index = MSR_GS_BASE, .always = true },
99 { .index = MSR_FS_BASE, .always = true },
100 { .index = MSR_KERNEL_GS_BASE, .always = true },
101 { .index = MSR_LSTAR, .always = true },
102 { .index = MSR_CSTAR, .always = true },
103 { .index = MSR_SYSCALL_MASK, .always = true },
104 #endif
105 { .index = MSR_IA32_SPEC_CTRL, .always = false },
106 { .index = MSR_IA32_PRED_CMD, .always = false },
107 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
108 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
109 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
110 { .index = MSR_IA32_LASTINTTOIP, .always = false },
111 { .index = MSR_INVALID, .always = false },
112 };
113
114 /* enable NPT for AMD64 and X86 with PAE */
115 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
116 bool npt_enabled = true;
117 #else
118 bool npt_enabled;
119 #endif
120
121 /*
122 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
123 * pause_filter_count: On processors that support Pause filtering (indicated
124 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
125 * count value. On VMRUN this value is loaded into an internal counter.
126 * Each time a pause instruction is executed, this counter is decremented
127 * until it reaches zero, at which time a #VMEXIT is generated if pause
128 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
129 * Intercept Filtering for more details.
130 * This also indicates whether PLE logic is enabled.
131 *
132 * pause_filter_thresh: In addition, some processor families support advanced
133 * pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an upper
134 * bound on the amount of time a guest is allowed to execute in a pause loop.
135 * In this mode, a 16-bit pause filter threshold field is added in the
136 * VMCB. The threshold value is a cycle count that is used to reset the
137 * pause counter. As with simple pause filtering, VMRUN loads the pause
138 * count value from VMCB into an internal counter. Then, on each pause
139 * instruction the hardware checks the elapsed number of cycles since
140 * the most recent pause instruction against the pause filter threshold.
141 * If the elapsed cycle count is greater than the pause filter threshold,
142 * then the internal pause count is reloaded from the VMCB and execution
143 * continues. If the elapsed cycle count is less than the pause filter
144 * threshold, then the internal pause count is decremented. If the count
145 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
146 * triggered. If advanced pause filtering is supported and pause filter
147 * threshold field is set to zero, the filter will operate in the simpler,
148 * count only mode.
149 */
150
151 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
152 module_param(pause_filter_thresh, ushort, 0444);
153
154 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
155 module_param(pause_filter_count, ushort, 0444);
156
157 /* Default doubles per-vcpu window every exit. */
158 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
159 module_param(pause_filter_count_grow, ushort, 0444);
160
161 /* Default resets per-vcpu window every exit to pause_filter_count. */
162 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
163 module_param(pause_filter_count_shrink, ushort, 0444);
164
165 /* Default is to compute the maximum so we can never overflow. */
166 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
167 module_param(pause_filter_count_max, ushort, 0444);
168
169 /* allow nested paging (virtualized MMU) for all guests */
170 static int npt = true;
171 module_param(npt, int, S_IRUGO);
172
173 /* allow nested virtualization in KVM/SVM */
174 static int nested = true;
175 module_param(nested, int, S_IRUGO);
176
177 /* enable/disable Next RIP Save */
178 static int nrips = true;
179 module_param(nrips, int, 0444);
180
181 /* enable/disable Virtual VMLOAD VMSAVE */
182 static int vls = true;
183 module_param(vls, int, 0444);
184
185 /* enable/disable Virtual GIF */
186 static int vgif = true;
187 module_param(vgif, int, 0444);
188
189 /* enable/disable SEV support */
190 static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
191 module_param(sev, int, 0444);
192
193 static bool __read_mostly dump_invalid_vmcb = 0;
194 module_param(dump_invalid_vmcb, bool, 0644);
195
196 static u8 rsm_ins_bytes[] = "\x0f\xaa";
197
198 static void svm_complete_interrupts(struct vcpu_svm *svm);
199
200 static unsigned long iopm_base;
201
202 struct kvm_ldttss_desc {
203 u16 limit0;
204 u16 base0;
205 unsigned base1:8, type:5, dpl:2, p:1;
206 unsigned limit1:4, zero0:3, g:1, base2:8;
207 u32 base3;
208 u32 zero1;
209 } __attribute__((packed));
210
211 DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
212
213 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
214
215 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
216 #define MSRS_RANGE_SIZE 2048
217 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
218
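/*
 * The MSR permission bitmap consists of three 2 KB ranges covering MSRs
 * 0x00000000-0x00001fff, 0xc0000000-0xc0001fff and 0xc0010000-0xc0011fff.
 * Each MSR uses two bits (read and write intercept).  Return the offset of
 * the MSR's bit pair in u32-sized units, or MSR_INVALID if the MSR is not
 * covered by the bitmap.
 */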
219 u32 svm_msrpm_offset(u32 msr)
220 {
221 u32 offset;
222 int i;
223
224 for (i = 0; i < NUM_MSR_MAPS; i++) {
225 if (msr < msrpm_ranges[i] ||
226 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
227 continue;
228
229 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
230 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
231
232 /* Now we have the u8 offset - but need the u32 offset */
233 return offset / 4;
234 }
235
236 /* MSR not in any range */
237 return MSR_INVALID;
238 }
239
240 #define MAX_INST_SIZE 15
241
242 static inline void clgi(void)
243 {
244 asm volatile (__ex("clgi"));
245 }
246
247 static inline void stgi(void)
248 {
249 asm volatile (__ex("stgi"));
250 }
251
252 static inline void invlpga(unsigned long addr, u32 asid)
253 {
254 asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
255 }
256
257 static int get_max_npt_level(void)
258 {
259 #ifdef CONFIG_X86_64
260 return PT64_ROOT_4LEVEL;
261 #else
262 return PT32E_ROOT_LEVEL;
263 #endif
264 }
265
266 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
267 {
268 struct vcpu_svm *svm = to_svm(vcpu);
269 u64 old_efer = vcpu->arch.efer;
270 vcpu->arch.efer = efer;
271
272 if (!npt_enabled) {
273 /* Shadow paging assumes NX to be available. */
274 efer |= EFER_NX;
275
276 if (!(efer & EFER_LMA))
277 efer &= ~EFER_LME;
278 }
279
280 if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
281 if (!(efer & EFER_SVME)) {
282 svm_leave_nested(vcpu);
283 svm_set_gif(svm, true);
284
285 /*
286 * Free the nested guest state, unless we are in SMM.
287 * In this case we will return to the nested guest
288 * as soon as we leave SMM.
289 */
290 if (!is_smm(&svm->vcpu))
291 svm_free_nested(svm);
292
293 } else {
294 int ret = svm_allocate_nested(svm);
295
296 if (ret) {
297 vcpu->arch.efer = old_efer;
298 return ret;
299 }
300 }
301 }
302
303 svm->vmcb->save.efer = efer | EFER_SVME;
304 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
305 return 0;
306 }
307
308 static int is_external_interrupt(u32 info)
309 {
310 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
311 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
312 }
313
314 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
315 {
316 struct vcpu_svm *svm = to_svm(vcpu);
317 u32 ret = 0;
318
319 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
320 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
321 return ret;
322 }
323
324 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
325 {
326 struct vcpu_svm *svm = to_svm(vcpu);
327
328 if (mask == 0)
329 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
330 else
331 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
332
333 }
334
335 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
336 {
337 struct vcpu_svm *svm = to_svm(vcpu);
338
339 if (nrips && svm->vmcb->control.next_rip != 0) {
340 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
341 svm->next_rip = svm->vmcb->control.next_rip;
342 }
343
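/* No next_rip provided by hardware (or nrips disabled): skip by decoding the instruction. */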
344 if (!svm->next_rip) {
345 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
346 return 0;
347 } else {
348 kvm_rip_write(vcpu, svm->next_rip);
349 }
350 svm_set_interrupt_shadow(vcpu, 0);
351
352 return 1;
353 }
354
355 static void svm_queue_exception(struct kvm_vcpu *vcpu)
356 {
357 struct vcpu_svm *svm = to_svm(vcpu);
358 unsigned nr = vcpu->arch.exception.nr;
359 bool has_error_code = vcpu->arch.exception.has_error_code;
360 u32 error_code = vcpu->arch.exception.error_code;
361
362 kvm_deliver_exception_payload(&svm->vcpu);
363
364 if (nr == BP_VECTOR && !nrips) {
365 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
366
367 /*
368 * For guest debugging where we have to reinject #BP if some
369 * INT3 is guest-owned:
370 * Emulate nRIP by moving RIP forward. Will fail if injection
371 * raises a fault that is not intercepted. Still better than
372 * failing in all cases.
373 */
374 (void)skip_emulated_instruction(&svm->vcpu);
375 rip = kvm_rip_read(&svm->vcpu);
376 svm->int3_rip = rip + svm->vmcb->save.cs.base;
377 svm->int3_injected = rip - old_rip;
378 }
379
380 svm->vmcb->control.event_inj = nr
381 | SVM_EVTINJ_VALID
382 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
383 | SVM_EVTINJ_TYPE_EXEPT;
384 svm->vmcb->control.event_inj_err = error_code;
385 }
386
387 static void svm_init_erratum_383(void)
388 {
389 u32 low, high;
390 int err;
391 u64 val;
392
393 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
394 return;
395
396 /* Use _safe variants to not break nested virtualization */
397 val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
398 if (err)
399 return;
400
401 val |= (1ULL << 47);
402
403 low = lower_32_bits(val);
404 high = upper_32_bits(val);
405
406 native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
407
408 erratum_383_found = true;
409 }
410
411 static void svm_init_osvw(struct kvm_vcpu *vcpu)
412 {
413 /*
414 * Guests should see errata 400 and 415 as fixed (assuming that
415 * HLT and IO instructions are intercepted).
416 */
417 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
418 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
419
420 /*
421 * By increasing VCPU's osvw.length to 3 we are telling the guest that
422 * all osvw.status bits inside that length, including bit 0 (which is
423 * reserved for erratum 298), are valid. However, if host processor's
424 * osvw_len is 0 then osvw_status[0] carries no information. We need to
425 * be conservative here and therefore we tell the guest that erratum 298
426 * is present (because we really don't know).
427 */
428 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
429 vcpu->arch.osvw.status |= 1;
430 }
431
432 static int has_svm(void)
433 {
434 const char *msg;
435
436 if (!cpu_has_svm(&msg)) {
437 printk(KERN_INFO "has_svm: %s\n", msg);
438 return 0;
439 }
440
441 if (sev_active()) {
442 pr_info("KVM is unsupported when running as an SEV guest\n");
443 return 0;
444 }
445
446 return 1;
447 }
448
449 static void svm_hardware_disable(void)
450 {
451 /* Make sure we clean up behind us */
452 if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
453 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
454
455 cpu_svm_disable();
456
457 amd_pmu_disable_virt();
458 }
459
460 static int svm_hardware_enable(void)
461 {
462
463 struct svm_cpu_data *sd;
464 uint64_t efer;
465 struct desc_struct *gdt;
466 int me = raw_smp_processor_id();
467
468 rdmsrl(MSR_EFER, efer);
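/* EFER.SVME already set: SVM is already enabled on this CPU, possibly by another hypervisor. */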
469 if (efer & EFER_SVME)
470 return -EBUSY;
471
472 if (!has_svm()) {
473 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
474 return -EINVAL;
475 }
476 sd = per_cpu(svm_data, me);
477 if (!sd) {
478 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
479 return -EINVAL;
480 }
481
482 sd->asid_generation = 1;
483 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
484 sd->next_asid = sd->max_asid + 1;
485 sd->min_asid = max_sev_asid + 1;
486
487 gdt = get_current_gdt_rw();
488 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
489
490 wrmsrl(MSR_EFER, efer | EFER_SVME);
491
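/* Point the CPU at the per-CPU host save area used across VMRUN/#VMEXIT. */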
492 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
493
494 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
495 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
496 __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
497 }
498
499
500 /*
501 * Get OSVW bits.
502 *
503 * Note that it is possible to have a system with mixed processor
504 * revisions and therefore different OSVW bits. If bits are not the same
505 * on different processors then choose the worst case (i.e. if erratum
506 * is present on one processor and not on another then assume that the
507 * erratum is present everywhere).
508 */
509 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
510 uint64_t len, status = 0;
511 int err;
512
513 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
514 if (!err)
515 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
516 &err);
517
518 if (err)
519 osvw_status = osvw_len = 0;
520 else {
521 if (len < osvw_len)
522 osvw_len = len;
523 osvw_status |= status;
524 osvw_status &= (1ULL << osvw_len) - 1;
525 }
526 } else
527 osvw_status = osvw_len = 0;
528
529 svm_init_erratum_383();
530
531 amd_pmu_enable_virt();
532
533 return 0;
534 }
535
536 static void svm_cpu_uninit(int cpu)
537 {
538 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
539
540 if (!sd)
541 return;
542
543 per_cpu(svm_data, cpu) = NULL;
544 kfree(sd->sev_vmcbs);
545 __free_page(sd->save_area);
546 kfree(sd);
547 }
548
549 static int svm_cpu_init(int cpu)
550 {
551 struct svm_cpu_data *sd;
552
553 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
554 if (!sd)
555 return -ENOMEM;
556 sd->cpu = cpu;
557 sd->save_area = alloc_page(GFP_KERNEL);
558 if (!sd->save_area)
559 goto free_cpu_data;
560
561 if (svm_sev_enabled()) {
562 sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
563 sizeof(void *),
564 GFP_KERNEL);
565 if (!sd->sev_vmcbs)
566 goto free_save_area;
567 }
568
569 per_cpu(svm_data, cpu) = sd;
570
571 return 0;
572
573 free_save_area:
574 __free_page(sd->save_area);
575 free_cpu_data:
576 kfree(sd);
577 return -ENOMEM;
578
579 }
580
581 static int direct_access_msr_slot(u32 msr)
582 {
583 u32 i;
584
585 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
586 if (direct_access_msrs[i].index == msr)
587 return i;
588
589 return -ENOENT;
590 }
591
592 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
593 int write)
594 {
595 struct vcpu_svm *svm = to_svm(vcpu);
596 int slot = direct_access_msr_slot(msr);
597
598 if (slot == -ENOENT)
599 return;
600
601 /* Set the shadow bitmaps to the desired intercept states */
602 if (read)
603 set_bit(slot, svm->shadow_msr_intercept.read);
604 else
605 clear_bit(slot, svm->shadow_msr_intercept.read);
606
607 if (write)
608 set_bit(slot, svm->shadow_msr_intercept.write);
609 else
610 clear_bit(slot, svm->shadow_msr_intercept.write);
611 }
612
613 static bool valid_msr_intercept(u32 index)
614 {
615 return direct_access_msr_slot(index) != -ENOENT;
616 }
617
618 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
619 {
620 u8 bit_write;
621 unsigned long tmp;
622 u32 offset;
623 u32 *msrpm;
624
625 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
626 to_svm(vcpu)->msrpm;
627
628 offset = svm_msrpm_offset(msr);
629 bit_write = 2 * (msr & 0x0f) + 1;
630 tmp = msrpm[offset];
631
632 BUG_ON(offset == MSR_INVALID);
633
634 return !!test_bit(bit_write, &tmp);
635 }
636
637 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
638 u32 msr, int read, int write)
639 {
640 u8 bit_read, bit_write;
641 unsigned long tmp;
642 u32 offset;
643
644 /*
645 * If this warning triggers extend the direct_access_msrs list at the
646 * beginning of the file
647 */
648 WARN_ON(!valid_msr_intercept(msr));
649
650 /* Force interception of MSRs that userspace has not allowed */
651 if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
652 read = 0;
653
654 if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
655 write = 0;
656
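/*
 * Each u32 of the permission map covers 16 MSRs, two bits per MSR: the
 * even bit intercepts reads, the odd bit intercepts writes.  A clear bit
 * grants the guest direct access.
 */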
657 offset = svm_msrpm_offset(msr);
658 bit_read = 2 * (msr & 0x0f);
659 bit_write = 2 * (msr & 0x0f) + 1;
660 tmp = msrpm[offset];
661
662 BUG_ON(offset == MSR_INVALID);
663
664 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
665 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
666
667 msrpm[offset] = tmp;
668 }
669
670 static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
671 int read, int write)
672 {
673 set_shadow_msr_intercept(vcpu, msr, read, write);
674 set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
675 }
676
677 u32 *svm_vcpu_alloc_msrpm(void)
678 {
679 struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
680 u32 *msrpm;
681
682 if (!pages)
683 return NULL;
684
685 msrpm = page_address(pages);
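/* Start with all bits set so every MSR access is intercepted by default. */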
686 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
687
688 return msrpm;
689 }
690
691 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
692 {
693 int i;
694
695 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
696 if (!direct_access_msrs[i].always)
697 continue;
698 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
699 }
700 }
701
702
703 void svm_vcpu_free_msrpm(u32 *msrpm)
704 {
705 __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
706 }
707
708 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
709 {
710 struct vcpu_svm *svm = to_svm(vcpu);
711 u32 i;
712
713 /*
714 * Set intercept permissions for all direct access MSRs again. They
715 * will automatically get filtered through the MSR filter, so we are
716 * back in sync after this.
717 */
718 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
719 u32 msr = direct_access_msrs[i].index;
720 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
721 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
722
723 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
724 }
725 }
726
727 static void add_msr_offset(u32 offset)
728 {
729 int i;
730
731 for (i = 0; i < MSRPM_OFFSETS; ++i) {
732
733 /* Offset already in list? */
734 if (msrpm_offsets[i] == offset)
735 return;
736
737 /* Slot used by another offset? */
738 if (msrpm_offsets[i] != MSR_INVALID)
739 continue;
740
741 /* Add offset to list */
742 msrpm_offsets[i] = offset;
743
744 return;
745 }
746
747 /*
748 * If this BUG triggers the msrpm_offsets table has an overflow. Just
749 * increase MSRPM_OFFSETS in this case.
750 */
751 BUG();
752 }
753
754 static void init_msrpm_offsets(void)
755 {
756 int i;
757
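/* 0xff bytes make every slot read as MSR_INVALID, i.e. unused. */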
758 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
759
760 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
761 u32 offset;
762
763 offset = svm_msrpm_offset(direct_access_msrs[i].index);
764 BUG_ON(offset == MSR_INVALID);
765
766 add_msr_offset(offset);
767 }
768 }
769
770 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
771 {
772 struct vcpu_svm *svm = to_svm(vcpu);
773
774 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
775 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
776 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
777 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
778 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
779 }
780
781 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
782 {
783 struct vcpu_svm *svm = to_svm(vcpu);
784
785 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
786 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
787 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
788 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
789 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
790 }
791
792 void disable_nmi_singlestep(struct vcpu_svm *svm)
793 {
794 svm->nmi_singlestep = false;
795
796 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
797 /* Clear our flags if they were not set by the guest */
798 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
799 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
800 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
801 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
802 }
803 }
804
805 static void grow_ple_window(struct kvm_vcpu *vcpu)
806 {
807 struct vcpu_svm *svm = to_svm(vcpu);
808 struct vmcb_control_area *control = &svm->vmcb->control;
809 int old = control->pause_filter_count;
810
811 control->pause_filter_count = __grow_ple_window(old,
812 pause_filter_count,
813 pause_filter_count_grow,
814 pause_filter_count_max);
815
816 if (control->pause_filter_count != old) {
817 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
818 trace_kvm_ple_window_update(vcpu->vcpu_id,
819 control->pause_filter_count, old);
820 }
821 }
822
823 static void shrink_ple_window(struct kvm_vcpu *vcpu)
824 {
825 struct vcpu_svm *svm = to_svm(vcpu);
826 struct vmcb_control_area *control = &svm->vmcb->control;
827 int old = control->pause_filter_count;
828
829 control->pause_filter_count =
830 __shrink_ple_window(old,
831 pause_filter_count,
832 pause_filter_count_shrink,
833 pause_filter_count);
834 if (control->pause_filter_count != old) {
835 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
836 trace_kvm_ple_window_update(vcpu->vcpu_id,
837 control->pause_filter_count, old);
838 }
839 }
840
841 /*
842 * The default MMIO mask is a single bit (excluding the present bit),
843 * which could conflict with the memory encryption bit. Check for
844 * memory encryption support and override the default MMIO mask if
845 * memory encryption is enabled.
846 */
847 static __init void svm_adjust_mmio_mask(void)
848 {
849 unsigned int enc_bit, mask_bit;
850 u64 msr, mask;
851
852 /* If there is no memory encryption support, use existing mask */
853 if (cpuid_eax(0x80000000) < 0x8000001f)
854 return;
855
856 /* If memory encryption is not enabled, use existing mask */
857 rdmsrl(MSR_K8_SYSCFG, msr);
858 if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
859 return;
860
861 enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
862 mask_bit = boot_cpu_data.x86_phys_bits;
863
864 /* Increment the mask bit if it is the same as the encryption bit */
865 if (enc_bit == mask_bit)
866 mask_bit++;
867
868 /*
869 * If the mask bit location is below 52, then some bits above the
870 * physical addressing limit will always be reserved, so use the
871 * rsvd_bits() function to generate the mask. This mask, along with
872 * the present bit, will be used to generate a page fault with
873 * PFER.RSV = 1.
874 *
875 * If the mask bit location is 52 (or above), then clear the mask.
876 */
877 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
878
879 kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
880 }
881
882 static void svm_hardware_teardown(void)
883 {
884 int cpu;
885
886 if (svm_sev_enabled())
887 sev_hardware_teardown();
888
889 for_each_possible_cpu(cpu)
890 svm_cpu_uninit(cpu);
891
892 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
893 iopm_base = 0;
894 }
895
896 static __init void svm_set_cpu_caps(void)
897 {
898 kvm_set_cpu_caps();
899
900 supported_xss = 0;
901
902 /* CPUID 0x80000001 and 0x8000000A (SVM features) */
903 if (nested) {
904 kvm_cpu_cap_set(X86_FEATURE_SVM);
905
906 if (nrips)
907 kvm_cpu_cap_set(X86_FEATURE_NRIPS);
908
909 if (npt_enabled)
910 kvm_cpu_cap_set(X86_FEATURE_NPT);
911 }
912
913 /* CPUID 0x80000008 */
914 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
915 boot_cpu_has(X86_FEATURE_AMD_SSBD))
916 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
917
918 /* Enable INVPCID feature */
919 kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
920 }
921
922 static __init int svm_hardware_setup(void)
923 {
924 int cpu;
925 struct page *iopm_pages;
926 void *iopm_va;
927 int r;
928
929 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
930
931 if (!iopm_pages)
932 return -ENOMEM;
933
934 iopm_va = page_address(iopm_pages);
935 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
936 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
937
938 init_msrpm_offsets();
939
940 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
941
942 if (boot_cpu_has(X86_FEATURE_NX))
943 kvm_enable_efer_bits(EFER_NX);
944
945 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
946 kvm_enable_efer_bits(EFER_FFXSR);
947
948 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
949 kvm_has_tsc_control = true;
950 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
951 kvm_tsc_scaling_ratio_frac_bits = 32;
952 }
953
954 /* Check for pause filtering support */
955 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
956 pause_filter_count = 0;
957 pause_filter_thresh = 0;
958 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
959 pause_filter_thresh = 0;
960 }
961
962 if (nested) {
963 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
964 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
965 }
966
967 if (sev) {
968 if (boot_cpu_has(X86_FEATURE_SEV) &&
969 IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
970 r = sev_hardware_setup();
971 if (r)
972 sev = false;
973 } else {
974 sev = false;
975 }
976 }
977
978 svm_adjust_mmio_mask();
979
980 for_each_possible_cpu(cpu) {
981 r = svm_cpu_init(cpu);
982 if (r)
983 goto err;
984 }
985
986 if (!boot_cpu_has(X86_FEATURE_NPT))
987 npt_enabled = false;
988
989 if (npt_enabled && !npt)
990 npt_enabled = false;
991
992 kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
993 pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
994
995 if (nrips) {
996 if (!boot_cpu_has(X86_FEATURE_NRIPS))
997 nrips = false;
998 }
999
1000 if (avic) {
1001 if (!npt_enabled ||
1002 !boot_cpu_has(X86_FEATURE_AVIC) ||
1003 !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
1004 avic = false;
1005 } else {
1006 pr_info("AVIC enabled\n");
1007
1008 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1009 }
1010 }
1011
1012 if (vls) {
1013 if (!npt_enabled ||
1014 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
1015 !IS_ENABLED(CONFIG_X86_64)) {
1016 vls = false;
1017 } else {
1018 pr_info("Virtual VMLOAD VMSAVE supported\n");
1019 }
1020 }
1021
1022 if (vgif) {
1023 if (!boot_cpu_has(X86_FEATURE_VGIF))
1024 vgif = false;
1025 else
1026 pr_info("Virtual GIF supported\n");
1027 }
1028
1029 svm_set_cpu_caps();
1030
1031 /*
1032 * It seems that on AMD processors PTE's accessed bit is
1033 * being set by the CPU hardware before the NPF vmexit.
1034 * This is not expected behaviour and our tests fail because
1035 * of it.
1036 * A workaround here is to disable support for
1037 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
1038 * In this case userspace can know if there is support using the
1039 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
1040 * it.
1041 * If future AMD CPU models change the behaviour described above,
1042 * this variable can be changed accordingly.
1043 */
1044 allow_smaller_maxphyaddr = !npt_enabled;
1045
1046 return 0;
1047
1048 err:
1049 svm_hardware_teardown();
1050 return r;
1051 }
1052
1053 static void init_seg(struct vmcb_seg *seg)
1054 {
1055 seg->selector = 0;
1056 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1057 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1058 seg->limit = 0xffff;
1059 seg->base = 0;
1060 }
1061
1062 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1063 {
1064 seg->selector = 0;
1065 seg->attrib = SVM_SELECTOR_P_MASK | type;
1066 seg->limit = 0xffff;
1067 seg->base = 0;
1068 }
1069
1070 static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1071 {
1072 struct vcpu_svm *svm = to_svm(vcpu);
1073 u64 g_tsc_offset = 0;
1074
1075 if (is_guest_mode(vcpu)) {
1076 /* Write L1's TSC offset. */
1077 g_tsc_offset = svm->vmcb->control.tsc_offset -
1078 svm->nested.hsave->control.tsc_offset;
1079 svm->nested.hsave->control.tsc_offset = offset;
1080 }
1081
1082 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1083 svm->vmcb->control.tsc_offset - g_tsc_offset,
1084 offset);
1085
1086 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1087
1088 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1089 return svm->vmcb->control.tsc_offset;
1090 }
1091
1092 static void svm_check_invpcid(struct vcpu_svm *svm)
1093 {
1094 /*
1095 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1096 * roots, or if INVPCID is disabled in the guest to inject #UD.
1097 */
1098 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1099 if (!npt_enabled ||
1100 !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1101 svm_set_intercept(svm, INTERCEPT_INVPCID);
1102 else
1103 svm_clr_intercept(svm, INTERCEPT_INVPCID);
1104 }
1105 }
1106
1107 static void init_vmcb(struct vcpu_svm *svm)
1108 {
1109 struct vmcb_control_area *control = &svm->vmcb->control;
1110 struct vmcb_save_area *save = &svm->vmcb->save;
1111
1112 svm->vcpu.arch.hflags = 0;
1113
1114 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1115 svm_set_intercept(svm, INTERCEPT_CR3_READ);
1116 svm_set_intercept(svm, INTERCEPT_CR4_READ);
1117 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1118 svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1119 svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1120 if (!kvm_vcpu_apicv_active(&svm->vcpu))
1121 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1122
1123 set_dr_intercepts(svm);
1124
1125 set_exception_intercept(svm, PF_VECTOR);
1126 set_exception_intercept(svm, UD_VECTOR);
1127 set_exception_intercept(svm, MC_VECTOR);
1128 set_exception_intercept(svm, AC_VECTOR);
1129 set_exception_intercept(svm, DB_VECTOR);
1130 /*
1131 * Guest access to VMware backdoor ports could legitimately
1132 * trigger #GP because of TSS I/O permission bitmap.
1133 * We intercept those #GP and allow access to them anyway
1134 * as VMware does.
1135 */
1136 if (enable_vmware_backdoor)
1137 set_exception_intercept(svm, GP_VECTOR);
1138
1139 svm_set_intercept(svm, INTERCEPT_INTR);
1140 svm_set_intercept(svm, INTERCEPT_NMI);
1141 svm_set_intercept(svm, INTERCEPT_SMI);
1142 svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1143 svm_set_intercept(svm, INTERCEPT_RDPMC);
1144 svm_set_intercept(svm, INTERCEPT_CPUID);
1145 svm_set_intercept(svm, INTERCEPT_INVD);
1146 svm_set_intercept(svm, INTERCEPT_INVLPG);
1147 svm_set_intercept(svm, INTERCEPT_INVLPGA);
1148 svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1149 svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1150 svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1151 svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1152 svm_set_intercept(svm, INTERCEPT_VMRUN);
1153 svm_set_intercept(svm, INTERCEPT_VMMCALL);
1154 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1155 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1156 svm_set_intercept(svm, INTERCEPT_STGI);
1157 svm_set_intercept(svm, INTERCEPT_CLGI);
1158 svm_set_intercept(svm, INTERCEPT_SKINIT);
1159 svm_set_intercept(svm, INTERCEPT_WBINVD);
1160 svm_set_intercept(svm, INTERCEPT_XSETBV);
1161 svm_set_intercept(svm, INTERCEPT_RDPRU);
1162 svm_set_intercept(svm, INTERCEPT_RSM);
1163
1164 if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
1165 svm_set_intercept(svm, INTERCEPT_MONITOR);
1166 svm_set_intercept(svm, INTERCEPT_MWAIT);
1167 }
1168
1169 if (!kvm_hlt_in_guest(svm->vcpu.kvm))
1170 svm_set_intercept(svm, INTERCEPT_HLT);
1171
1172 control->iopm_base_pa = __sme_set(iopm_base);
1173 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1174 control->int_ctl = V_INTR_MASKING_MASK;
1175
1176 init_seg(&save->es);
1177 init_seg(&save->ss);
1178 init_seg(&save->ds);
1179 init_seg(&save->fs);
1180 init_seg(&save->gs);
1181
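/* Architectural reset state: CS.base + RIP = 0xfffffff0, the x86 reset vector. */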
1182 save->cs.selector = 0xf000;
1183 save->cs.base = 0xffff0000;
1184 /* Executable/Readable Code Segment */
1185 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1186 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1187 save->cs.limit = 0xffff;
1188
1189 save->gdtr.limit = 0xffff;
1190 save->idtr.limit = 0xffff;
1191
1192 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1193 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1194
1195 svm_set_cr4(&svm->vcpu, 0);
1196 svm_set_efer(&svm->vcpu, 0);
1197 save->dr6 = 0xffff0ff0;
1198 kvm_set_rflags(&svm->vcpu, 2);
1199 save->rip = 0x0000fff0;
1200 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1201
1202 /*
1203 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1204 * It also updates the guest-visible cr0 value.
1205 */
1206 svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1207 kvm_mmu_reset_context(&svm->vcpu);
1208
1209 save->cr4 = X86_CR4_PAE;
1210 /* rdx = ?? */
1211
1212 if (npt_enabled) {
1213 /* Setup VMCB for Nested Paging */
1214 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1215 svm_clr_intercept(svm, INTERCEPT_INVLPG);
1216 clr_exception_intercept(svm, PF_VECTOR);
1217 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1218 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1219 save->g_pat = svm->vcpu.arch.pat;
1220 save->cr3 = 0;
1221 save->cr4 = 0;
1222 }
1223 svm->asid_generation = 0;
1224
1225 svm->nested.vmcb12_gpa = 0;
1226 svm->vcpu.arch.hflags = 0;
1227
1228 if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
1229 control->pause_filter_count = pause_filter_count;
1230 if (pause_filter_thresh)
1231 control->pause_filter_thresh = pause_filter_thresh;
1232 svm_set_intercept(svm, INTERCEPT_PAUSE);
1233 } else {
1234 svm_clr_intercept(svm, INTERCEPT_PAUSE);
1235 }
1236
1237 svm_check_invpcid(svm);
1238
1239 if (kvm_vcpu_apicv_active(&svm->vcpu))
1240 avic_init_vmcb(svm);
1241
1242 /*
1243 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1244 * in VMCB and clear intercepts to avoid #VMEXIT.
1245 */
1246 if (vls) {
1247 svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1248 svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1249 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1250 }
1251
1252 if (vgif) {
1253 svm_clr_intercept(svm, INTERCEPT_STGI);
1254 svm_clr_intercept(svm, INTERCEPT_CLGI);
1255 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1256 }
1257
1258 if (sev_guest(svm->vcpu.kvm)) {
1259 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
1260 clr_exception_intercept(svm, UD_VECTOR);
1261 }
1262
1263 vmcb_mark_all_dirty(svm->vmcb);
1264
1265 enable_gif(svm);
1266
1267 }
1268
1269 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1270 {
1271 struct vcpu_svm *svm = to_svm(vcpu);
1272 u32 dummy;
1273 u32 eax = 1;
1274
1275 svm->spec_ctrl = 0;
1276 svm->virt_spec_ctrl = 0;
1277
1278 if (!init_event) {
1279 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1280 MSR_IA32_APICBASE_ENABLE;
1281 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1282 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1283 }
1284 init_vmcb(svm);
1285
1286 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
1287 kvm_rdx_write(vcpu, eax);
1288
1289 if (kvm_vcpu_apicv_active(vcpu) && !init_event)
1290 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1291 }
1292
1293 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
1294 {
1295 struct vcpu_svm *svm;
1296 struct page *vmcb_page;
1297 int err;
1298
1299 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1300 svm = to_svm(vcpu);
1301
1302 err = -ENOMEM;
1303 vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1304 if (!vmcb_page)
1305 goto out;
1306
1307 err = avic_init_vcpu(svm);
1308 if (err)
1309 goto error_free_vmcb_page;
1310
1311 /* We initialize this flag to true to make sure that the is_running
1312 * bit would be set the first time the vcpu is loaded.
1313 */
1314 if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
1315 svm->avic_is_running = true;
1316
1317 svm->msrpm = svm_vcpu_alloc_msrpm();
1318 if (!svm->msrpm) {
1319 err = -ENOMEM;
1320 goto error_free_vmcb_page;
1321 }
1322
1323 svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1324
1325 svm->vmcb = page_address(vmcb_page);
1326 svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
1327 svm->asid_generation = 0;
1328 init_vmcb(svm);
1329
1330 svm_init_osvw(vcpu);
1331 vcpu->arch.microcode_version = 0x01000065;
1332
1333 return 0;
1334
1335 error_free_vmcb_page:
1336 __free_page(vmcb_page);
1337 out:
1338 return err;
1339 }
1340
1341 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1342 {
1343 int i;
1344
1345 for_each_online_cpu(i)
1346 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1347 }
1348
1349 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1350 {
1351 struct vcpu_svm *svm = to_svm(vcpu);
1352
1353 /*
1354 * The vmcb page can be recycled, causing a false negative in
1355 * svm_vcpu_load(). So, ensure that no logical CPU has this
1356 * vmcb page recorded as its current vmcb.
1357 */
1358 svm_clear_current_vmcb(svm->vmcb);
1359
1360 svm_free_nested(svm);
1361
1362 __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
1363 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1364 }
1365
1366 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1367 {
1368 struct vcpu_svm *svm = to_svm(vcpu);
1369 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1370 int i;
1371
1372 if (unlikely(cpu != vcpu->cpu)) {
1373 svm->asid_generation = 0;
1374 vmcb_mark_all_dirty(svm->vmcb);
1375 }
1376
1377 #ifdef CONFIG_X86_64
1378 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1379 #endif
1380 savesegment(fs, svm->host.fs);
1381 savesegment(gs, svm->host.gs);
1382 svm->host.ldt = kvm_read_ldt();
1383
1384 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1385 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1386
1387 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1388 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1389 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1390 __this_cpu_write(current_tsc_ratio, tsc_ratio);
1391 wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1392 }
1393 }
1394 /* This assumes that the kernel never uses MSR_TSC_AUX */
1395 if (static_cpu_has(X86_FEATURE_RDTSCP))
1396 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1397
1398 if (sd->current_vmcb != svm->vmcb) {
1399 sd->current_vmcb = svm->vmcb;
1400 indirect_branch_prediction_barrier();
1401 }
1402 avic_vcpu_load(vcpu, cpu);
1403 }
1404
1405 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1406 {
1407 struct vcpu_svm *svm = to_svm(vcpu);
1408 int i;
1409
1410 avic_vcpu_put(vcpu);
1411
1412 ++vcpu->stat.host_state_reload;
1413 kvm_load_ldt(svm->host.ldt);
1414 #ifdef CONFIG_X86_64
1415 loadsegment(fs, svm->host.fs);
1416 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
1417 load_gs_index(svm->host.gs);
1418 #else
1419 #ifdef CONFIG_X86_32_LAZY_GS
1420 loadsegment(gs, svm->host.gs);
1421 #endif
1422 #endif
1423 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1424 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1425 }
1426
1427 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1428 {
1429 struct vcpu_svm *svm = to_svm(vcpu);
1430 unsigned long rflags = svm->vmcb->save.rflags;
1431
1432 if (svm->nmi_singlestep) {
1433 /* Hide our flags if they were not set by the guest */
1434 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1435 rflags &= ~X86_EFLAGS_TF;
1436 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1437 rflags &= ~X86_EFLAGS_RF;
1438 }
1439 return rflags;
1440 }
1441
1442 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1443 {
1444 if (to_svm(vcpu)->nmi_singlestep)
1445 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1446
1447 /*
1448 * Any change of EFLAGS.VM is accompanied by a reload of SS
1449 * (caused by either a task switch or an inter-privilege IRET),
1450 * so we do not need to update the CPL here.
1451 */
1452 to_svm(vcpu)->vmcb->save.rflags = rflags;
1453 }
1454
1455 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1456 {
1457 switch (reg) {
1458 case VCPU_EXREG_PDPTR:
1459 BUG_ON(!npt_enabled);
1460 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1461 break;
1462 default:
1463 WARN_ON_ONCE(1);
1464 }
1465 }
1466
1467 static void svm_set_vintr(struct vcpu_svm *svm)
1468 {
1469 struct vmcb_control_area *control;
1470
1471 /* The following fields are ignored when AVIC is enabled */
1472 WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
1473 svm_set_intercept(svm, INTERCEPT_VINTR);
1474
1475 /*
1476 * This is just a dummy VINTR to actually cause a vmexit to happen.
1477 * Actual injection of virtual interrupts happens through EVENTINJ.
1478 */
1479 control = &svm->vmcb->control;
1480 control->int_vector = 0x0;
1481 control->int_ctl &= ~V_INTR_PRIO_MASK;
1482 control->int_ctl |= V_IRQ_MASK |
1483 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1484 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1485 }
1486
1487 static void svm_clear_vintr(struct vcpu_svm *svm)
1488 {
1489 svm_clr_intercept(svm, INTERCEPT_VINTR);
1490
1491 /* Drop int_ctl fields related to VINTR injection. */
1492 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1493 if (is_guest_mode(&svm->vcpu)) {
1494 svm->nested.hsave->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1495
1496 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1497 (svm->nested.ctl.int_ctl & V_TPR_MASK));
1498 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1499 V_IRQ_INJECTION_BITS_MASK;
1500
1501 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1502 }
1503
1504 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1505 }
1506
1507 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1508 {
1509 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1510
1511 switch (seg) {
1512 case VCPU_SREG_CS: return &save->cs;
1513 case VCPU_SREG_DS: return &save->ds;
1514 case VCPU_SREG_ES: return &save->es;
1515 case VCPU_SREG_FS: return &save->fs;
1516 case VCPU_SREG_GS: return &save->gs;
1517 case VCPU_SREG_SS: return &save->ss;
1518 case VCPU_SREG_TR: return &save->tr;
1519 case VCPU_SREG_LDTR: return &save->ldtr;
1520 }
1521 BUG();
1522 return NULL;
1523 }
1524
1525 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1526 {
1527 struct vmcb_seg *s = svm_seg(vcpu, seg);
1528
1529 return s->base;
1530 }
1531
1532 static void svm_get_segment(struct kvm_vcpu *vcpu,
1533 struct kvm_segment *var, int seg)
1534 {
1535 struct vmcb_seg *s = svm_seg(vcpu, seg);
1536
1537 var->base = s->base;
1538 var->limit = s->limit;
1539 var->selector = s->selector;
1540 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1541 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1542 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1543 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1544 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1545 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1546 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1547
1548 /*
1549 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1550 * However, the SVM spec states that the G bit is not observed by the
1551 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1552 * So let's synthesize a legal G bit for all segments, this helps
1553 * running KVM nested. It also helps cross-vendor migration, because
1554 * Intel's vmentry has a check on the 'G' bit.
1555 */
1556 var->g = s->limit > 0xfffff;
1557
1558 /*
1559 * AMD's VMCB does not have an explicit unusable field, so emulate it
1560 * for cross vendor migration purposes by "not present"
1561 */
1562 var->unusable = !var->present;
1563
1564 switch (seg) {
1565 case VCPU_SREG_TR:
1566 /*
1567 * Work around a bug where the busy flag in the tr selector
1568 * isn't exposed
1569 */
1570 var->type |= 0x2;
1571 break;
1572 case VCPU_SREG_DS:
1573 case VCPU_SREG_ES:
1574 case VCPU_SREG_FS:
1575 case VCPU_SREG_GS:
1576 /*
1577 * The accessed bit must always be set in the segment
1578 * descriptor cache, although it can be cleared in the
1579 * descriptor, the cached bit always remains at 1. Since
1580 * Intel has a check on this, set it here to support
1581 * cross-vendor migration.
1582 */
1583 if (!var->unusable)
1584 var->type |= 0x1;
1585 break;
1586 case VCPU_SREG_SS:
1587 /*
1588 * On AMD CPUs sometimes the DB bit in the segment
1589 * descriptor is left as 1, although the whole segment has
1590 * been made unusable. Clear it here to pass an Intel VMX
1591 * entry check when cross vendor migrating.
1592 */
1593 if (var->unusable)
1594 var->db = 0;
1595 /* This is symmetric with svm_set_segment() */
1596 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1597 break;
1598 }
1599 }
1600
1601 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1602 {
1603 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1604
1605 return save->cpl;
1606 }
1607
1608 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1609 {
1610 struct vcpu_svm *svm = to_svm(vcpu);
1611
1612 dt->size = svm->vmcb->save.idtr.limit;
1613 dt->address = svm->vmcb->save.idtr.base;
1614 }
1615
1616 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1617 {
1618 struct vcpu_svm *svm = to_svm(vcpu);
1619
1620 svm->vmcb->save.idtr.limit = dt->size;
1621 svm->vmcb->save.idtr.base = dt->address;
1622 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1623 }
1624
1625 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1626 {
1627 struct vcpu_svm *svm = to_svm(vcpu);
1628
1629 dt->size = svm->vmcb->save.gdtr.limit;
1630 dt->address = svm->vmcb->save.gdtr.base;
1631 }
1632
1633 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1634 {
1635 struct vcpu_svm *svm = to_svm(vcpu);
1636
1637 svm->vmcb->save.gdtr.limit = dt->size;
1638 svm->vmcb->save.gdtr.base = dt->address;
1639 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1640 }
1641
1642 static void update_cr0_intercept(struct vcpu_svm *svm)
1643 {
1644 ulong gcr0 = svm->vcpu.arch.cr0;
1645 u64 *hcr0 = &svm->vmcb->save.cr0;
1646
1647 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1648 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1649
1650 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1651
1652 if (gcr0 == *hcr0) {
1653 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1654 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1655 } else {
1656 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1657 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1658 }
1659 }
1660
1661 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1662 {
1663 struct vcpu_svm *svm = to_svm(vcpu);
1664
1665 #ifdef CONFIG_X86_64
1666 if (vcpu->arch.efer & EFER_LME) {
1667 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1668 vcpu->arch.efer |= EFER_LMA;
1669 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1670 }
1671
1672 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1673 vcpu->arch.efer &= ~EFER_LMA;
1674 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1675 }
1676 }
1677 #endif
1678 vcpu->arch.cr0 = cr0;
1679
1680 if (!npt_enabled)
1681 cr0 |= X86_CR0_PG | X86_CR0_WP;
1682
1683 /*
1684 * re-enable caching here because the QEMU bios
1685 * does not do it - this results in some delay at
1686 * reboot
1687 */
1688 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1689 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1690 svm->vmcb->save.cr0 = cr0;
1691 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1692 update_cr0_intercept(svm);
1693 }
1694
1695 int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1696 {
1697 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1698 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1699
1700 if (cr4 & X86_CR4_VMXE)
1701 return 1;
1702
1703 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1704 svm_flush_tlb(vcpu);
1705
1706 vcpu->arch.cr4 = cr4;
1707 if (!npt_enabled)
1708 cr4 |= X86_CR4_PAE;
1709 cr4 |= host_cr4_mce;
1710 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1711 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1712 return 0;
1713 }
1714
1715 static void svm_set_segment(struct kvm_vcpu *vcpu,
1716 struct kvm_segment *var, int seg)
1717 {
1718 struct vcpu_svm *svm = to_svm(vcpu);
1719 struct vmcb_seg *s = svm_seg(vcpu, seg);
1720
1721 s->base = var->base;
1722 s->limit = var->limit;
1723 s->selector = var->selector;
1724 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1725 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1726 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1727 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1728 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1729 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1730 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1731 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1732
1733 /*
1734 * This is always accurate, except if SYSRET returned to a segment
1735 * with SS.DPL != 3. Intel does not have this quirk, and always
1736 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1737 * would entail passing the CPL to userspace and back.
1738 */
1739 if (seg == VCPU_SREG_SS)
1740 /* This is symmetric with svm_get_segment() */
1741 svm->vmcb->save.cpl = (var->dpl & 3);
1742
1743 vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1744 }
1745
1746 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1747 {
1748 struct vcpu_svm *svm = to_svm(vcpu);
1749
1750 clr_exception_intercept(svm, BP_VECTOR);
1751
1752 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1753 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1754 set_exception_intercept(svm, BP_VECTOR);
1755 }
1756 }
1757
1758 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1759 {
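/* Per-CPU ASID space exhausted: start a new generation and flush all ASIDs. */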
1760 if (sd->next_asid > sd->max_asid) {
1761 ++sd->asid_generation;
1762 sd->next_asid = sd->min_asid;
1763 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1764 }
1765
1766 svm->asid_generation = sd->asid_generation;
1767 svm->vmcb->control.asid = sd->next_asid++;
1768
1769 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1770 }
1771
1772 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1773 {
1774 struct vmcb *vmcb = svm->vmcb;
1775
1776 if (unlikely(value != vmcb->save.dr6)) {
1777 vmcb->save.dr6 = value;
1778 vmcb_mark_dirty(vmcb, VMCB_DR);
1779 }
1780 }
1781
1782 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1783 {
1784 struct vcpu_svm *svm = to_svm(vcpu);
1785
1786 get_debugreg(vcpu->arch.db[0], 0);
1787 get_debugreg(vcpu->arch.db[1], 1);
1788 get_debugreg(vcpu->arch.db[2], 2);
1789 get_debugreg(vcpu->arch.db[3], 3);
1790 /*
1791 * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
1792 * because db_interception might need it. We can do it before vmentry.
1793 */
1794 vcpu->arch.dr6 = svm->vmcb->save.dr6;
1795 vcpu->arch.dr7 = svm->vmcb->save.dr7;
1796 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1797 set_dr_intercepts(svm);
1798 }
1799
1800 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1801 {
1802 struct vcpu_svm *svm = to_svm(vcpu);
1803
1804 svm->vmcb->save.dr7 = value;
1805 vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1806 }
1807
1808 static int pf_interception(struct vcpu_svm *svm)
1809 {
1810 u64 fault_address = svm->vmcb->control.exit_info_2;
1811 u64 error_code = svm->vmcb->control.exit_info_1;
1812
1813 return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
1814 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1815 svm->vmcb->control.insn_bytes : NULL,
1816 svm->vmcb->control.insn_len);
1817 }
1818
1819 static int npf_interception(struct vcpu_svm *svm)
1820 {
1821 u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
1822 u64 error_code = svm->vmcb->control.exit_info_1;
1823
1824 trace_kvm_page_fault(fault_address, error_code);
1825 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1826 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1827 svm->vmcb->control.insn_bytes : NULL,
1828 svm->vmcb->control.insn_len);
1829 }
1830
1831 static int db_interception(struct vcpu_svm *svm)
1832 {
1833 struct kvm_run *kvm_run = svm->vcpu.run;
1834 struct kvm_vcpu *vcpu = &svm->vcpu;
1835
1836 if (!(svm->vcpu.guest_debug &
1837 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1838 !svm->nmi_singlestep) {
1839 u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
1840 kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
1841 return 1;
1842 }
1843
1844 if (svm->nmi_singlestep) {
1845 disable_nmi_singlestep(svm);
1846 /* Make sure we check for pending NMIs upon entry */
1847 kvm_make_request(KVM_REQ_EVENT, vcpu);
1848 }
1849
1850 if (svm->vcpu.guest_debug &
1851 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1852 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1853 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
1854 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
1855 kvm_run->debug.arch.pc =
1856 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1857 kvm_run->debug.arch.exception = DB_VECTOR;
1858 return 0;
1859 }
1860
1861 return 1;
1862 }
1863
1864 static int bp_interception(struct vcpu_svm *svm)
1865 {
1866 struct kvm_run *kvm_run = svm->vcpu.run;
1867
1868 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1869 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1870 kvm_run->debug.arch.exception = BP_VECTOR;
1871 return 0;
1872 }
1873
1874 static int ud_interception(struct vcpu_svm *svm)
1875 {
1876 return handle_ud(&svm->vcpu);
1877 }
1878
1879 static int ac_interception(struct vcpu_svm *svm)
1880 {
1881 kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
1882 return 1;
1883 }
1884
1885 static int gp_interception(struct vcpu_svm *svm)
1886 {
1887 struct kvm_vcpu *vcpu = &svm->vcpu;
1888 u32 error_code = svm->vmcb->control.exit_info_1;
1889
1890 WARN_ON_ONCE(!enable_vmware_backdoor);
1891
1892 /*
1893 * VMware backdoor emulation on #GP interception only handles IN{S},
1894 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
1895 */
1896 if (error_code) {
1897 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
1898 return 1;
1899 }
1900 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
1901 }
1902
1903 static bool is_erratum_383(void)
1904 {
1905 int err, i;
1906 u64 value;
1907
1908 if (!erratum_383_found)
1909 return false;
1910
1911 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1912 if (err)
1913 return false;
1914
1915 /* Bit 62 may or may not be set for this mce */
1916 value &= ~(1ULL << 62);
1917
1918 if (value != 0xb600000000010015ULL)
1919 return false;
1920
1921 /* Clear MCi_STATUS registers */
1922 for (i = 0; i < 6; ++i)
1923 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1924
1925 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1926 if (!err) {
1927 u32 low, high;
1928
1929 value &= ~(1ULL << 2);
1930 low = lower_32_bits(value);
1931 high = upper_32_bits(value);
1932
1933 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1934 }
1935
1936 /* Flush tlb to evict multi-match entries */
1937 __flush_tlb_all();
1938
1939 return true;
1940 }
1941
1942 /*
1943 * Trigger machine check on the host. We assume all the MSRs are already set up
1944 * by the CPU and that we still run on the same CPU as the MCE occurred on.
1945 * We pass a fake environment to the machine check handler because we want
1946 * the guest to be always treated like user space, no matter what context
1947 * it used internally.
1948 */
1949 static void kvm_machine_check(void)
1950 {
1951 #if defined(CONFIG_X86_MCE)
1952 struct pt_regs regs = {
1953 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
1954 .flags = X86_EFLAGS_IF,
1955 };
1956
1957 do_machine_check(&regs);
1958 #endif
1959 }
1960
1961 static void svm_handle_mce(struct vcpu_svm *svm)
1962 {
1963 if (is_erratum_383()) {
1964 /*
1965 * Erratum 383 triggered. Guest state is corrupt so kill the
1966 * guest.
1967 */
1968 pr_err("KVM: Guest triggered AMD Erratum 383\n");
1969
1970 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1971
1972 return;
1973 }
1974
1975 /*
1976 * On an #MC intercept the MCE handler is not called automatically in
1977 * the host. So do it by hand here.
1978 */
1979 kvm_machine_check();
1980 }
1981
1982 static int mc_interception(struct vcpu_svm *svm)
1983 {
1984 return 1;
1985 }
1986
1987 static int shutdown_interception(struct vcpu_svm *svm)
1988 {
1989 struct kvm_run *kvm_run = svm->vcpu.run;
1990
1991 /*
1992 * VMCB is undefined after a SHUTDOWN intercept
1993 * so reinitialize it.
1994 */
1995 clear_page(svm->vmcb);
1996 init_vmcb(svm);
1997
1998 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1999 return 0;
2000 }
2001
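/*
 * IOIO intercept: EXITINFO1 describes the access (string and direction bits,
 * size field, port number in the upper 16 bits) and EXITINFO2 holds the RIP
 * of the following instruction, used here as next_rip for fast PIO handling.
 */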
2002 static int io_interception(struct vcpu_svm *svm)
2003 {
2004 struct kvm_vcpu *vcpu = &svm->vcpu;
2005 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2006 int size, in, string;
2007 unsigned port;
2008
2009 ++svm->vcpu.stat.io_exits;
2010 string = (io_info & SVM_IOIO_STR_MASK) != 0;
2011 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2012 if (string)
2013 return kvm_emulate_instruction(vcpu, 0);
2014
2015 port = io_info >> 16;
2016 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2017 svm->next_rip = svm->vmcb->control.exit_info_2;
2018
2019 return kvm_fast_pio(&svm->vcpu, size, port, in);
2020 }
2021
2022 static int nmi_interception(struct vcpu_svm *svm)
2023 {
2024 return 1;
2025 }
2026
2027 static int intr_interception(struct vcpu_svm *svm)
2028 {
2029 ++svm->vcpu.stat.irq_exits;
2030 return 1;
2031 }
2032
2033 static int nop_on_interception(struct vcpu_svm *svm)
2034 {
2035 return 1;
2036 }
2037
2038 static int halt_interception(struct vcpu_svm *svm)
2039 {
2040 return kvm_emulate_halt(&svm->vcpu);
2041 }
2042
2043 static int vmmcall_interception(struct vcpu_svm *svm)
2044 {
2045 return kvm_emulate_hypercall(&svm->vcpu);
2046 }
2047
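/*
 * VMLOAD/VMSAVE emulation: guest RAX holds the guest-physical address of a
 * VMCB. The page is mapped and nested_svm_vmloadsave() copies the
 * VMLOAD/VMSAVE-visible state (hidden segment state such as FS/GS/TR/LDTR
 * plus the SYSCALL/SYSENTER MSRs) between that VMCB and the current one.
 */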
2048 static int vmload_interception(struct vcpu_svm *svm)
2049 {
2050 struct vmcb *nested_vmcb;
2051 struct kvm_host_map map;
2052 int ret;
2053
2054 if (nested_svm_check_permissions(svm))
2055 return 1;
2056
2057 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2058 if (ret) {
2059 if (ret == -EINVAL)
2060 kvm_inject_gp(&svm->vcpu, 0);
2061 return 1;
2062 }
2063
2064 nested_vmcb = map.hva;
2065
2066 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2067
2068 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2069 kvm_vcpu_unmap(&svm->vcpu, &map, true);
2070
2071 return ret;
2072 }
2073
2074 static int vmsave_interception(struct vcpu_svm *svm)
2075 {
2076 struct vmcb *nested_vmcb;
2077 struct kvm_host_map map;
2078 int ret;
2079
2080 if (nested_svm_check_permissions(svm))
2081 return 1;
2082
2083 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2084 if (ret) {
2085 if (ret == -EINVAL)
2086 kvm_inject_gp(&svm->vcpu, 0);
2087 return 1;
2088 }
2089
2090 nested_vmcb = map.hva;
2091
2092 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2093
2094 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2095 kvm_vcpu_unmap(&svm->vcpu, &map, true);
2096
2097 return ret;
2098 }
2099
2100 static int vmrun_interception(struct vcpu_svm *svm)
2101 {
2102 if (nested_svm_check_permissions(svm))
2103 return 1;
2104
2105 return nested_svm_vmrun(svm);
2106 }
2107
2108 void svm_set_gif(struct vcpu_svm *svm, bool value)
2109 {
2110 if (value) {
2111 /*
2112 * If VGIF is enabled, the STGI intercept is only added to
2113 * detect the opening of the SMI/NMI window; remove it now.
2114 * Likewise, clear the VINTR intercept; we will set it
2115 * again while processing KVM_REQ_EVENT if needed.
2116 */
2117 if (vgif_enabled(svm))
2118 svm_clr_intercept(svm, INTERCEPT_STGI);
2119 if (svm_is_intercept(svm, INTERCEPT_VINTR))
2120 svm_clear_vintr(svm);
2121
2122 enable_gif(svm);
2123 if (svm->vcpu.arch.smi_pending ||
2124 svm->vcpu.arch.nmi_pending ||
2125 kvm_cpu_has_injectable_intr(&svm->vcpu))
2126 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2127 } else {
2128 disable_gif(svm);
2129
2130 /*
2131 * After a CLGI no interrupts should come. But if vGIF is
2132 * in use, we still rely on the VINTR intercept (rather than
2133 * STGI) to detect an open interrupt window.
2134 */
2135 if (!vgif_enabled(svm))
2136 svm_clear_vintr(svm);
2137 }
2138 }
2139
2140 static int stgi_interception(struct vcpu_svm *svm)
2141 {
2142 int ret;
2143
2144 if (nested_svm_check_permissions(svm))
2145 return 1;
2146
2147 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2148 svm_set_gif(svm, true);
2149 return ret;
2150 }
2151
2152 static int clgi_interception(struct vcpu_svm *svm)
2153 {
2154 int ret;
2155
2156 if (nested_svm_check_permissions(svm))
2157 return 1;
2158
2159 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2160 svm_set_gif(svm, false);
2161 return ret;
2162 }
2163
2164 static int invlpga_interception(struct vcpu_svm *svm)
2165 {
2166 struct kvm_vcpu *vcpu = &svm->vcpu;
2167
2168 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
2169 kvm_rax_read(&svm->vcpu));
2170
2171 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2172 kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
2173
2174 return kvm_skip_emulated_instruction(&svm->vcpu);
2175 }
2176
2177 static int skinit_interception(struct vcpu_svm *svm)
2178 {
2179 trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
2180
2181 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2182 return 1;
2183 }
2184
2185 static int wbinvd_interception(struct vcpu_svm *svm)
2186 {
2187 return kvm_emulate_wbinvd(&svm->vcpu);
2188 }
2189
2190 static int xsetbv_interception(struct vcpu_svm *svm)
2191 {
2192 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2193 u32 index = kvm_rcx_read(&svm->vcpu);
2194
2195 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2196 return kvm_skip_emulated_instruction(&svm->vcpu);
2197 }
2198
2199 return 1;
2200 }
2201
2202 static int rdpru_interception(struct vcpu_svm *svm)
2203 {
2204 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2205 return 1;
2206 }
2207
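/*
 * Task-switch intercept: the TSS selector arrives in EXITINFO1, while
 * EXITINFO2 encodes how the switch was initiated (IRET, far JMP, or via a
 * gate) together with an optional error code. Any event that was being
 * delivered is reported in exit_int_info and must be unwound before the
 * switch is emulated.
 */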
2208 static int task_switch_interception(struct vcpu_svm *svm)
2209 {
2210 u16 tss_selector;
2211 int reason;
2212 int int_type = svm->vmcb->control.exit_int_info &
2213 SVM_EXITINTINFO_TYPE_MASK;
2214 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2215 uint32_t type =
2216 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2217 uint32_t idt_v =
2218 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2219 bool has_error_code = false;
2220 u32 error_code = 0;
2221
2222 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2223
2224 if (svm->vmcb->control.exit_info_2 &
2225 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2226 reason = TASK_SWITCH_IRET;
2227 else if (svm->vmcb->control.exit_info_2 &
2228 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2229 reason = TASK_SWITCH_JMP;
2230 else if (idt_v)
2231 reason = TASK_SWITCH_GATE;
2232 else
2233 reason = TASK_SWITCH_CALL;
2234
2235 if (reason == TASK_SWITCH_GATE) {
2236 switch (type) {
2237 case SVM_EXITINTINFO_TYPE_NMI:
2238 svm->vcpu.arch.nmi_injected = false;
2239 break;
2240 case SVM_EXITINTINFO_TYPE_EXEPT:
2241 if (svm->vmcb->control.exit_info_2 &
2242 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2243 has_error_code = true;
2244 error_code =
2245 (u32)svm->vmcb->control.exit_info_2;
2246 }
2247 kvm_clear_exception_queue(&svm->vcpu);
2248 break;
2249 case SVM_EXITINTINFO_TYPE_INTR:
2250 kvm_clear_interrupt_queue(&svm->vcpu);
2251 break;
2252 default:
2253 break;
2254 }
2255 }
2256
2257 if (reason != TASK_SWITCH_GATE ||
2258 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2259 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2260 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2261 if (!skip_emulated_instruction(&svm->vcpu))
2262 return 0;
2263 }
2264
2265 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2266 int_vec = -1;
2267
2268 return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2269 has_error_code, error_code);
2270 }
2271
2272 static int cpuid_interception(struct vcpu_svm *svm)
2273 {
2274 return kvm_emulate_cpuid(&svm->vcpu);
2275 }
2276
2277 static int iret_interception(struct vcpu_svm *svm)
2278 {
2279 ++svm->vcpu.stat.nmi_window_exits;
2280 svm_clr_intercept(svm, INTERCEPT_IRET);
2281 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2282 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2283 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2284 return 1;
2285 }
2286
2287 static int invd_interception(struct vcpu_svm *svm)
2288 {
2289 /* Treat an INVD instruction as a NOP and just skip it. */
2290 return kvm_skip_emulated_instruction(&svm->vcpu);
2291 }
2292
2293 static int invlpg_interception(struct vcpu_svm *svm)
2294 {
2295 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2296 return kvm_emulate_instruction(&svm->vcpu, 0);
2297
2298 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2299 return kvm_skip_emulated_instruction(&svm->vcpu);
2300 }
2301
2302 static int emulate_on_interception(struct vcpu_svm *svm)
2303 {
2304 return kvm_emulate_instruction(&svm->vcpu, 0);
2305 }
2306
2307 static int rsm_interception(struct vcpu_svm *svm)
2308 {
2309 return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
2310 }
2311
2312 static int rdpmc_interception(struct vcpu_svm *svm)
2313 {
2314 int err;
2315
2316 if (!nrips)
2317 return emulate_on_interception(svm);
2318
2319 err = kvm_rdpmc(&svm->vcpu);
2320 return kvm_complete_insn_gp(&svm->vcpu, err);
2321 }
2322
2323 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
2324 unsigned long val)
2325 {
2326 unsigned long cr0 = svm->vcpu.arch.cr0;
2327 bool ret = false;
2328
2329 if (!is_guest_mode(&svm->vcpu) ||
2330 (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2331 return false;
2332
2333 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2334 val &= ~SVM_CR0_SELECTIVE_MASK;
2335
2336 if (cr0 ^ val) {
2337 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2338 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2339 }
2340
2341 return ret;
2342 }
2343
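/*
 * With decode assists, bit 63 of EXITINFO1 indicates that the MOV-CR
 * information is valid and bits 3:0 name the GPR involved; without it the
 * instruction has to be emulated.
 */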
2344 #define CR_VALID (1ULL << 63)
2345
2346 static int cr_interception(struct vcpu_svm *svm)
2347 {
2348 int reg, cr;
2349 unsigned long val;
2350 int err;
2351
2352 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2353 return emulate_on_interception(svm);
2354
2355 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2356 return emulate_on_interception(svm);
2357
2358 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2359 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2360 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2361 else
2362 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2363
2364 err = 0;
2365 if (cr >= 16) { /* mov to cr */
2366 cr -= 16;
2367 val = kvm_register_readl(&svm->vcpu, reg);
2368 trace_kvm_cr_write(cr, val);
2369 switch (cr) {
2370 case 0:
2371 if (!check_selective_cr0_intercepted(svm, val))
2372 err = kvm_set_cr0(&svm->vcpu, val);
2373 else
2374 return 1;
2375
2376 break;
2377 case 3:
2378 err = kvm_set_cr3(&svm->vcpu, val);
2379 break;
2380 case 4:
2381 err = kvm_set_cr4(&svm->vcpu, val);
2382 break;
2383 case 8:
2384 err = kvm_set_cr8(&svm->vcpu, val);
2385 break;
2386 default:
2387 WARN(1, "unhandled write to CR%d", cr);
2388 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2389 return 1;
2390 }
2391 } else { /* mov from cr */
2392 switch (cr) {
2393 case 0:
2394 val = kvm_read_cr0(&svm->vcpu);
2395 break;
2396 case 2:
2397 val = svm->vcpu.arch.cr2;
2398 break;
2399 case 3:
2400 val = kvm_read_cr3(&svm->vcpu);
2401 break;
2402 case 4:
2403 val = kvm_read_cr4(&svm->vcpu);
2404 break;
2405 case 8:
2406 val = kvm_get_cr8(&svm->vcpu);
2407 break;
2408 default:
2409 WARN(1, "unhandled read from CR%d", cr);
2410 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2411 return 1;
2412 }
2413 kvm_register_writel(&svm->vcpu, reg, val);
2414 trace_kvm_cr_read(cr, val);
2415 }
2416 return kvm_complete_insn_gp(&svm->vcpu, err);
2417 }
2418
2419 static int dr_interception(struct vcpu_svm *svm)
2420 {
2421 int reg, dr;
2422 unsigned long val;
2423
2424 if (svm->vcpu.guest_debug == 0) {
2425 /*
2426 * No more DR vmexits; force a reload of the debug registers
2427 * and reenter on this instruction. The next vmexit will
2428 * retrieve the full state of the debug registers.
2429 */
2430 clr_dr_intercepts(svm);
2431 svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2432 return 1;
2433 }
2434
2435 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2436 return emulate_on_interception(svm);
2437
2438 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2439 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2440
2441 if (dr >= 16) { /* mov to DRn */
2442 if (!kvm_require_dr(&svm->vcpu, dr - 16))
2443 return 1;
2444 val = kvm_register_readl(&svm->vcpu, reg);
2445 kvm_set_dr(&svm->vcpu, dr - 16, val);
2446 } else {
2447 if (!kvm_require_dr(&svm->vcpu, dr))
2448 return 1;
2449 kvm_get_dr(&svm->vcpu, dr, &val);
2450 kvm_register_writel(&svm->vcpu, reg, val);
2451 }
2452
2453 return kvm_skip_emulated_instruction(&svm->vcpu);
2454 }
2455
2456 static int cr8_write_interception(struct vcpu_svm *svm)
2457 {
2458 struct kvm_run *kvm_run = svm->vcpu.run;
2459 int r;
2460
2461 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2462 /* instruction emulation calls kvm_set_cr8() */
2463 r = cr_interception(svm);
2464 if (lapic_in_kernel(&svm->vcpu))
2465 return r;
2466 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2467 return r;
2468 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2469 return 0;
2470 }
2471
2472 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2473 {
2474 msr->data = 0;
2475
2476 switch (msr->index) {
2477 case MSR_F10H_DECFG:
2478 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2479 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2480 break;
2481 case MSR_IA32_PERF_CAPABILITIES:
2482 return 0;
2483 default:
2484 return KVM_MSR_RET_INVALID;
2485 }
2486
2487 return 0;
2488 }
2489
2490 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2491 {
2492 struct vcpu_svm *svm = to_svm(vcpu);
2493
2494 switch (msr_info->index) {
2495 case MSR_STAR:
2496 msr_info->data = svm->vmcb->save.star;
2497 break;
2498 #ifdef CONFIG_X86_64
2499 case MSR_LSTAR:
2500 msr_info->data = svm->vmcb->save.lstar;
2501 break;
2502 case MSR_CSTAR:
2503 msr_info->data = svm->vmcb->save.cstar;
2504 break;
2505 case MSR_KERNEL_GS_BASE:
2506 msr_info->data = svm->vmcb->save.kernel_gs_base;
2507 break;
2508 case MSR_SYSCALL_MASK:
2509 msr_info->data = svm->vmcb->save.sfmask;
2510 break;
2511 #endif
2512 case MSR_IA32_SYSENTER_CS:
2513 msr_info->data = svm->vmcb->save.sysenter_cs;
2514 break;
2515 case MSR_IA32_SYSENTER_EIP:
2516 msr_info->data = svm->sysenter_eip;
2517 break;
2518 case MSR_IA32_SYSENTER_ESP:
2519 msr_info->data = svm->sysenter_esp;
2520 break;
2521 case MSR_TSC_AUX:
2522 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
2523 return 1;
2524 if (!msr_info->host_initiated &&
2525 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2526 return 1;
2527 msr_info->data = svm->tsc_aux;
2528 break;
2529 /*
2530 * Nobody will change the following 5 values in the VMCB so we can
2531 * safely return them on rdmsr. They will always be 0 until LBRV is
2532 * implemented.
2533 */
2534 case MSR_IA32_DEBUGCTLMSR:
2535 msr_info->data = svm->vmcb->save.dbgctl;
2536 break;
2537 case MSR_IA32_LASTBRANCHFROMIP:
2538 msr_info->data = svm->vmcb->save.br_from;
2539 break;
2540 case MSR_IA32_LASTBRANCHTOIP:
2541 msr_info->data = svm->vmcb->save.br_to;
2542 break;
2543 case MSR_IA32_LASTINTFROMIP:
2544 msr_info->data = svm->vmcb->save.last_excp_from;
2545 break;
2546 case MSR_IA32_LASTINTTOIP:
2547 msr_info->data = svm->vmcb->save.last_excp_to;
2548 break;
2549 case MSR_VM_HSAVE_PA:
2550 msr_info->data = svm->nested.hsave_msr;
2551 break;
2552 case MSR_VM_CR:
2553 msr_info->data = svm->nested.vm_cr_msr;
2554 break;
2555 case MSR_IA32_SPEC_CTRL:
2556 if (!msr_info->host_initiated &&
2557 !guest_has_spec_ctrl_msr(vcpu))
2558 return 1;
2559
2560 msr_info->data = svm->spec_ctrl;
2561 break;
2562 case MSR_AMD64_VIRT_SPEC_CTRL:
2563 if (!msr_info->host_initiated &&
2564 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2565 return 1;
2566
2567 msr_info->data = svm->virt_spec_ctrl;
2568 break;
2569 case MSR_F15H_IC_CFG: {
2570
2571 int family, model;
2572
2573 family = guest_cpuid_family(vcpu);
2574 model = guest_cpuid_model(vcpu);
2575
2576 if (family < 0 || model < 0)
2577 return kvm_get_msr_common(vcpu, msr_info);
2578
2579 msr_info->data = 0;
2580
2581 if (family == 0x15 &&
2582 (model >= 0x2 && model < 0x20))
2583 msr_info->data = 0x1E;
2584 }
2585 break;
2586 case MSR_F10H_DECFG:
2587 msr_info->data = svm->msr_decfg;
2588 break;
2589 default:
2590 return kvm_get_msr_common(vcpu, msr_info);
2591 }
2592 return 0;
2593 }
2594
2595 static int rdmsr_interception(struct vcpu_svm *svm)
2596 {
2597 return kvm_emulate_rdmsr(&svm->vcpu);
2598 }
2599
2600 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2601 {
2602 struct vcpu_svm *svm = to_svm(vcpu);
2603 int svm_dis, chg_mask;
2604
2605 if (data & ~SVM_VM_CR_VALID_MASK)
2606 return 1;
2607
2608 chg_mask = SVM_VM_CR_VALID_MASK;
2609
2610 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2611 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2612
2613 svm->nested.vm_cr_msr &= ~chg_mask;
2614 svm->nested.vm_cr_msr |= (data & chg_mask);
2615
2616 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2617
2618 /* check for svm_disable while efer.svme is set */
2619 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2620 return 1;
2621
2622 return 0;
2623 }
2624
2625 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2626 {
2627 struct vcpu_svm *svm = to_svm(vcpu);
2628
2629 u32 ecx = msr->index;
2630 u64 data = msr->data;
2631 switch (ecx) {
2632 case MSR_IA32_CR_PAT:
2633 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2634 return 1;
2635 vcpu->arch.pat = data;
2636 svm->vmcb->save.g_pat = data;
2637 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2638 break;
2639 case MSR_IA32_SPEC_CTRL:
2640 if (!msr->host_initiated &&
2641 !guest_has_spec_ctrl_msr(vcpu))
2642 return 1;
2643
2644 if (kvm_spec_ctrl_test_value(data))
2645 return 1;
2646
2647 svm->spec_ctrl = data;
2648 if (!data)
2649 break;
2650
2651 /*
2652 * For non-nested:
2653 * When it's written (to non-zero) for the first time, pass
2654 * it through.
2655 *
2656 * For nested:
2657 * The handling of the MSR bitmap for L2 guests is done in
2658 * nested_svm_vmrun_msrpm.
2659 * We update the L1 MSR bit as well since it will end up
2660 * touching the MSR anyway now.
2661 */
2662 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2663 break;
2664 case MSR_IA32_PRED_CMD:
2665 if (!msr->host_initiated &&
2666 !guest_has_pred_cmd_msr(vcpu))
2667 return 1;
2668
2669 if (data & ~PRED_CMD_IBPB)
2670 return 1;
2671 if (!boot_cpu_has(X86_FEATURE_IBPB))
2672 return 1;
2673 if (!data)
2674 break;
2675
2676 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2677 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
2678 break;
2679 case MSR_AMD64_VIRT_SPEC_CTRL:
2680 if (!msr->host_initiated &&
2681 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2682 return 1;
2683
2684 if (data & ~SPEC_CTRL_SSBD)
2685 return 1;
2686
2687 svm->virt_spec_ctrl = data;
2688 break;
2689 case MSR_STAR:
2690 svm->vmcb->save.star = data;
2691 break;
2692 #ifdef CONFIG_X86_64
2693 case MSR_LSTAR:
2694 svm->vmcb->save.lstar = data;
2695 break;
2696 case MSR_CSTAR:
2697 svm->vmcb->save.cstar = data;
2698 break;
2699 case MSR_KERNEL_GS_BASE:
2700 svm->vmcb->save.kernel_gs_base = data;
2701 break;
2702 case MSR_SYSCALL_MASK:
2703 svm->vmcb->save.sfmask = data;
2704 break;
2705 #endif
2706 case MSR_IA32_SYSENTER_CS:
2707 svm->vmcb->save.sysenter_cs = data;
2708 break;
2709 case MSR_IA32_SYSENTER_EIP:
2710 svm->sysenter_eip = data;
2711 svm->vmcb->save.sysenter_eip = data;
2712 break;
2713 case MSR_IA32_SYSENTER_ESP:
2714 svm->sysenter_esp = data;
2715 svm->vmcb->save.sysenter_esp = data;
2716 break;
2717 case MSR_TSC_AUX:
2718 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
2719 return 1;
2720
2721 if (!msr->host_initiated &&
2722 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2723 return 1;
2724
2725 /*
2726 * This is rare, so we update the MSR here instead of using
2727 * direct_access_msrs. Doing that would require a rdmsr in
2728 * svm_vcpu_put.
2729 */
2730 svm->tsc_aux = data;
2731 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
2732 break;
2733 case MSR_IA32_DEBUGCTLMSR:
2734 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2735 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2736 __func__, data);
2737 break;
2738 }
2739 if (data & DEBUGCTL_RESERVED_BITS)
2740 return 1;
2741
2742 svm->vmcb->save.dbgctl = data;
2743 vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
2744 if (data & (1ULL<<0))
2745 svm_enable_lbrv(vcpu);
2746 else
2747 svm_disable_lbrv(vcpu);
2748 break;
2749 case MSR_VM_HSAVE_PA:
2750 /*
2751 * Old kernels did not validate the value written to
2752 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
2753 * value to allow live migrating buggy or malicious guests
2754 * originating from those kernels.
2755 */
2756 if (!msr->host_initiated && !page_address_valid(vcpu, data))
2757 return 1;
2758
2759 svm->nested.hsave_msr = data & PAGE_MASK;
2760 break;
2761 case MSR_VM_CR:
2762 return svm_set_vm_cr(vcpu, data);
2763 case MSR_VM_IGNNE:
2764 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2765 break;
2766 case MSR_F10H_DECFG: {
2767 struct kvm_msr_entry msr_entry;
2768
2769 msr_entry.index = msr->index;
2770 if (svm_get_msr_feature(&msr_entry))
2771 return 1;
2772
2773 /* Check the supported bits */
2774 if (data & ~msr_entry.data)
2775 return 1;
2776
2777 /* Don't allow the guest to change a bit, #GP */
2778 if (!msr->host_initiated && (data ^ msr_entry.data))
2779 return 1;
2780
2781 svm->msr_decfg = data;
2782 break;
2783 }
2784 case MSR_IA32_APICBASE:
2785 if (kvm_vcpu_apicv_active(vcpu))
2786 avic_update_vapic_bar(to_svm(vcpu), data);
2787 fallthrough;
2788 default:
2789 return kvm_set_msr_common(vcpu, msr);
2790 }
2791 return 0;
2792 }
2793
2794 static int wrmsr_interception(struct vcpu_svm *svm)
2795 {
2796 return kvm_emulate_wrmsr(&svm->vcpu);
2797 }
2798
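/* EXITINFO1 distinguishes WRMSR (1) from RDMSR (0) for the MSR intercept. */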
2799 static int msr_interception(struct vcpu_svm *svm)
2800 {
2801 if (svm->vmcb->control.exit_info_1)
2802 return wrmsr_interception(svm);
2803 else
2804 return rdmsr_interception(svm);
2805 }
2806
2807 static int interrupt_window_interception(struct vcpu_svm *svm)
2808 {
2809 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2810 svm_clear_vintr(svm);
2811
2812 /*
2813 * For AVIC, the only reason to end up here is ExtINTs.
2814 * In this case AVIC was temporarily disabled for
2815 * requesting the IRQ window and we have to re-enable it.
2816 */
2817 svm_toggle_avic_for_irq_window(&svm->vcpu, true);
2818
2819 ++svm->vcpu.stat.irq_window_exits;
2820 return 1;
2821 }
2822
2823 static int pause_interception(struct vcpu_svm *svm)
2824 {
2825 struct kvm_vcpu *vcpu = &svm->vcpu;
2826 bool in_kernel = (svm_get_cpl(vcpu) == 0);
2827
2828 if (!kvm_pause_in_guest(vcpu->kvm))
2829 grow_ple_window(vcpu);
2830
2831 kvm_vcpu_on_spin(vcpu, in_kernel);
2832 return 1;
2833 }
2834
2835 static int nop_interception(struct vcpu_svm *svm)
2836 {
2837 return kvm_skip_emulated_instruction(&(svm->vcpu));
2838 }
2839
2840 static int monitor_interception(struct vcpu_svm *svm)
2841 {
2842 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
2843 return nop_interception(svm);
2844 }
2845
2846 static int mwait_interception(struct vcpu_svm *svm)
2847 {
2848 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
2849 return nop_interception(svm);
2850 }
2851
2852 static int invpcid_interception(struct vcpu_svm *svm)
2853 {
2854 struct kvm_vcpu *vcpu = &svm->vcpu;
2855 unsigned long type;
2856 gva_t gva;
2857
2858 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
2859 kvm_queue_exception(vcpu, UD_VECTOR);
2860 return 1;
2861 }
2862
2863 /*
2864 * For an INVPCID intercept:
2865 * EXITINFO1 provides the linear address of the memory operand.
2866 * EXITINFO2 provides the contents of the register operand.
2867 */
2868 type = svm->vmcb->control.exit_info_2;
2869 gva = svm->vmcb->control.exit_info_1;
2870
2871 if (type > 3) {
2872 kvm_inject_gp(vcpu, 0);
2873 return 1;
2874 }
2875
2876 return kvm_handle_invpcid(vcpu, type, gva);
2877 }
2878
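/*
 * Exit handler dispatch table, indexed by the SVM exit code. Entries left
 * NULL fall through to the "unexpected exit reason" path in handle_exit().
 */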
2879 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
2880 [SVM_EXIT_READ_CR0] = cr_interception,
2881 [SVM_EXIT_READ_CR3] = cr_interception,
2882 [SVM_EXIT_READ_CR4] = cr_interception,
2883 [SVM_EXIT_READ_CR8] = cr_interception,
2884 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
2885 [SVM_EXIT_WRITE_CR0] = cr_interception,
2886 [SVM_EXIT_WRITE_CR3] = cr_interception,
2887 [SVM_EXIT_WRITE_CR4] = cr_interception,
2888 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2889 [SVM_EXIT_READ_DR0] = dr_interception,
2890 [SVM_EXIT_READ_DR1] = dr_interception,
2891 [SVM_EXIT_READ_DR2] = dr_interception,
2892 [SVM_EXIT_READ_DR3] = dr_interception,
2893 [SVM_EXIT_READ_DR4] = dr_interception,
2894 [SVM_EXIT_READ_DR5] = dr_interception,
2895 [SVM_EXIT_READ_DR6] = dr_interception,
2896 [SVM_EXIT_READ_DR7] = dr_interception,
2897 [SVM_EXIT_WRITE_DR0] = dr_interception,
2898 [SVM_EXIT_WRITE_DR1] = dr_interception,
2899 [SVM_EXIT_WRITE_DR2] = dr_interception,
2900 [SVM_EXIT_WRITE_DR3] = dr_interception,
2901 [SVM_EXIT_WRITE_DR4] = dr_interception,
2902 [SVM_EXIT_WRITE_DR5] = dr_interception,
2903 [SVM_EXIT_WRITE_DR6] = dr_interception,
2904 [SVM_EXIT_WRITE_DR7] = dr_interception,
2905 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2906 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2907 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
2908 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
2909 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
2910 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
2911 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
2912 [SVM_EXIT_INTR] = intr_interception,
2913 [SVM_EXIT_NMI] = nmi_interception,
2914 [SVM_EXIT_SMI] = nop_on_interception,
2915 [SVM_EXIT_INIT] = nop_on_interception,
2916 [SVM_EXIT_VINTR] = interrupt_window_interception,
2917 [SVM_EXIT_RDPMC] = rdpmc_interception,
2918 [SVM_EXIT_CPUID] = cpuid_interception,
2919 [SVM_EXIT_IRET] = iret_interception,
2920 [SVM_EXIT_INVD] = invd_interception,
2921 [SVM_EXIT_PAUSE] = pause_interception,
2922 [SVM_EXIT_HLT] = halt_interception,
2923 [SVM_EXIT_INVLPG] = invlpg_interception,
2924 [SVM_EXIT_INVLPGA] = invlpga_interception,
2925 [SVM_EXIT_IOIO] = io_interception,
2926 [SVM_EXIT_MSR] = msr_interception,
2927 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
2928 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
2929 [SVM_EXIT_VMRUN] = vmrun_interception,
2930 [SVM_EXIT_VMMCALL] = vmmcall_interception,
2931 [SVM_EXIT_VMLOAD] = vmload_interception,
2932 [SVM_EXIT_VMSAVE] = vmsave_interception,
2933 [SVM_EXIT_STGI] = stgi_interception,
2934 [SVM_EXIT_CLGI] = clgi_interception,
2935 [SVM_EXIT_SKINIT] = skinit_interception,
2936 [SVM_EXIT_WBINVD] = wbinvd_interception,
2937 [SVM_EXIT_MONITOR] = monitor_interception,
2938 [SVM_EXIT_MWAIT] = mwait_interception,
2939 [SVM_EXIT_XSETBV] = xsetbv_interception,
2940 [SVM_EXIT_RDPRU] = rdpru_interception,
2941 [SVM_EXIT_INVPCID] = invpcid_interception,
2942 [SVM_EXIT_NPF] = npf_interception,
2943 [SVM_EXIT_RSM] = rsm_interception,
2944 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
2945 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
2946 };
2947
2948 static void dump_vmcb(struct kvm_vcpu *vcpu)
2949 {
2950 struct vcpu_svm *svm = to_svm(vcpu);
2951 struct vmcb_control_area *control = &svm->vmcb->control;
2952 struct vmcb_save_area *save = &svm->vmcb->save;
2953
2954 if (!dump_invalid_vmcb) {
2955 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
2956 return;
2957 }
2958
2959 pr_err("VMCB Control Area:\n");
2960 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
2961 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
2962 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
2963 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
2964 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
2965 pr_err("%-20s%08x %08x\n", "intercepts:",
2966 control->intercepts[INTERCEPT_WORD3],
2967 control->intercepts[INTERCEPT_WORD4]);
2968 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
2969 pr_err("%-20s%d\n", "pause filter threshold:",
2970 control->pause_filter_thresh);
2971 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
2972 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
2973 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
2974 pr_err("%-20s%d\n", "asid:", control->asid);
2975 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
2976 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
2977 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
2978 pr_err("%-20s%08x\n", "int_state:", control->int_state);
2979 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
2980 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
2981 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
2982 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
2983 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
2984 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
2985 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
2986 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
2987 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
2988 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
2989 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
2990 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
2991 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
2992 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
2993 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
2994 pr_err("VMCB State Save Area:\n");
2995 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2996 "es:",
2997 save->es.selector, save->es.attrib,
2998 save->es.limit, save->es.base);
2999 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3000 "cs:",
3001 save->cs.selector, save->cs.attrib,
3002 save->cs.limit, save->cs.base);
3003 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3004 "ss:",
3005 save->ss.selector, save->ss.attrib,
3006 save->ss.limit, save->ss.base);
3007 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3008 "ds:",
3009 save->ds.selector, save->ds.attrib,
3010 save->ds.limit, save->ds.base);
3011 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3012 "fs:",
3013 save->fs.selector, save->fs.attrib,
3014 save->fs.limit, save->fs.base);
3015 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3016 "gs:",
3017 save->gs.selector, save->gs.attrib,
3018 save->gs.limit, save->gs.base);
3019 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3020 "gdtr:",
3021 save->gdtr.selector, save->gdtr.attrib,
3022 save->gdtr.limit, save->gdtr.base);
3023 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3024 "ldtr:",
3025 save->ldtr.selector, save->ldtr.attrib,
3026 save->ldtr.limit, save->ldtr.base);
3027 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3028 "idtr:",
3029 save->idtr.selector, save->idtr.attrib,
3030 save->idtr.limit, save->idtr.base);
3031 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3032 "tr:",
3033 save->tr.selector, save->tr.attrib,
3034 save->tr.limit, save->tr.base);
3035 pr_err("cpl: %d efer: %016llx\n",
3036 save->cpl, save->efer);
3037 pr_err("%-15s %016llx %-13s %016llx\n",
3038 "cr0:", save->cr0, "cr2:", save->cr2);
3039 pr_err("%-15s %016llx %-13s %016llx\n",
3040 "cr3:", save->cr3, "cr4:", save->cr4);
3041 pr_err("%-15s %016llx %-13s %016llx\n",
3042 "dr6:", save->dr6, "dr7:", save->dr7);
3043 pr_err("%-15s %016llx %-13s %016llx\n",
3044 "rip:", save->rip, "rflags:", save->rflags);
3045 pr_err("%-15s %016llx %-13s %016llx\n",
3046 "rsp:", save->rsp, "rax:", save->rax);
3047 pr_err("%-15s %016llx %-13s %016llx\n",
3048 "star:", save->star, "lstar:", save->lstar);
3049 pr_err("%-15s %016llx %-13s %016llx\n",
3050 "cstar:", save->cstar, "sfmask:", save->sfmask);
3051 pr_err("%-15s %016llx %-13s %016llx\n",
3052 "kernel_gs_base:", save->kernel_gs_base,
3053 "sysenter_cs:", save->sysenter_cs);
3054 pr_err("%-15s %016llx %-13s %016llx\n",
3055 "sysenter_esp:", save->sysenter_esp,
3056 "sysenter_eip:", save->sysenter_eip);
3057 pr_err("%-15s %016llx %-13s %016llx\n",
3058 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3059 pr_err("%-15s %016llx %-13s %016llx\n",
3060 "br_from:", save->br_from, "br_to:", save->br_to);
3061 pr_err("%-15s %016llx %-13s %016llx\n",
3062 "excp_from:", save->last_excp_from,
3063 "excp_to:", save->last_excp_to);
3064 }
3065
3066 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
3067 u32 *intr_info, u32 *error_code)
3068 {
3069 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3070
3071 *info1 = control->exit_info_1;
3072 *info2 = control->exit_info_2;
3073 *intr_info = control->exit_int_info;
3074 if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3075 (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3076 *error_code = control->exit_int_info_err;
3077 else
3078 *error_code = 0;
3079 }
3080
3081 static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3082 {
3083 struct vcpu_svm *svm = to_svm(vcpu);
3084 struct kvm_run *kvm_run = vcpu->run;
3085 u32 exit_code = svm->vmcb->control.exit_code;
3086
3087 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
3088
3089 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3090 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3091 if (npt_enabled)
3092 vcpu->arch.cr3 = svm->vmcb->save.cr3;
3093
3094 if (is_guest_mode(vcpu)) {
3095 int vmexit;
3096
3097 trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
3098
3099 vmexit = nested_svm_exit_special(svm);
3100
3101 if (vmexit == NESTED_EXIT_CONTINUE)
3102 vmexit = nested_svm_exit_handled(svm);
3103
3104 if (vmexit == NESTED_EXIT_DONE)
3105 return 1;
3106 }
3107
3108 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3109 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3110 kvm_run->fail_entry.hardware_entry_failure_reason
3111 = svm->vmcb->control.exit_code;
3112 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3113 dump_vmcb(vcpu);
3114 return 0;
3115 }
3116
3117 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3118 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3119 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3120 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3121 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3122 "exit_code 0x%x\n",
3123 __func__, svm->vmcb->control.exit_int_info,
3124 exit_code);
3125
3126 if (exit_fastpath != EXIT_FASTPATH_NONE)
3127 return 1;
3128
3129 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3130 || !svm_exit_handlers[exit_code]) {
3131 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
3132 dump_vmcb(vcpu);
3133 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3134 vcpu->run->internal.suberror =
3135 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3136 vcpu->run->internal.ndata = 2;
3137 vcpu->run->internal.data[0] = exit_code;
3138 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3139 return 0;
3140 }
3141
3142 #ifdef CONFIG_RETPOLINE
3143 if (exit_code == SVM_EXIT_MSR)
3144 return msr_interception(svm);
3145 else if (exit_code == SVM_EXIT_VINTR)
3146 return interrupt_window_interception(svm);
3147 else if (exit_code == SVM_EXIT_INTR)
3148 return intr_interception(svm);
3149 else if (exit_code == SVM_EXIT_HLT)
3150 return halt_interception(svm);
3151 else if (exit_code == SVM_EXIT_NPF)
3152 return npf_interception(svm);
3153 #endif
3154 return svm_exit_handlers[exit_code](svm);
3155 }
3156
3157 static void reload_tss(struct kvm_vcpu *vcpu)
3158 {
3159 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3160
3161 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3162 load_TR_desc();
3163 }
3164
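/*
 * Per-run bookkeeping before VMRUN: SEV guests manage their ASID in
 * pre_sev_run(), everything else gets a fresh ASID whenever the per-CPU
 * generation has moved on (e.g. after migrating to another physical CPU).
 */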
3165 static void pre_svm_run(struct vcpu_svm *svm)
3166 {
3167 struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
3168
3169 if (sev_guest(svm->vcpu.kvm))
3170 return pre_sev_run(svm, svm->vcpu.cpu);
3171
3172 /* FIXME: handle wraparound of asid_generation */
3173 if (svm->asid_generation != sd->asid_generation)
3174 new_asid(svm, sd);
3175 }
3176
3177 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3178 {
3179 struct vcpu_svm *svm = to_svm(vcpu);
3180
3181 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3182 vcpu->arch.hflags |= HF_NMI_MASK;
3183 svm_set_intercept(svm, INTERCEPT_IRET);
3184 ++vcpu->stat.nmi_injections;
3185 }
3186
3187 static void svm_set_irq(struct kvm_vcpu *vcpu)
3188 {
3189 struct vcpu_svm *svm = to_svm(vcpu);
3190
3191 BUG_ON(!(gif_set(svm)));
3192
3193 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3194 ++vcpu->stat.irq_injections;
3195
3196 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3197 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3198 }
3199
3200 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3201 {
3202 struct vcpu_svm *svm = to_svm(vcpu);
3203
3204 if (nested_svm_virtualize_tpr(vcpu))
3205 return;
3206
3207 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3208
3209 if (irr == -1)
3210 return;
3211
3212 if (tpr >= irr)
3213 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3214 }
3215
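/*
 * NMIs are blocked while GIF is clear, while the guest sits in an interrupt
 * shadow, or while a previously injected NMI is still masked (HF_NMI_MASK).
 * For L2, an NMI that would cause a nested VM-Exit is never considered
 * blocked here.
 */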
3216 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3217 {
3218 struct vcpu_svm *svm = to_svm(vcpu);
3219 struct vmcb *vmcb = svm->vmcb;
3220 bool ret;
3221
3222 if (!gif_set(svm))
3223 return true;
3224
3225 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3226 return false;
3227
3228 ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3229 (svm->vcpu.arch.hflags & HF_NMI_MASK);
3230
3231 return ret;
3232 }
3233
3234 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3235 {
3236 struct vcpu_svm *svm = to_svm(vcpu);
3237 if (svm->nested.nested_run_pending)
3238 return -EBUSY;
3239
3240 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
3241 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3242 return -EBUSY;
3243
3244 return !svm_nmi_blocked(vcpu);
3245 }
3246
3247 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3248 {
3249 struct vcpu_svm *svm = to_svm(vcpu);
3250
3251 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
3252 }
3253
3254 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3255 {
3256 struct vcpu_svm *svm = to_svm(vcpu);
3257
3258 if (masked) {
3259 svm->vcpu.arch.hflags |= HF_NMI_MASK;
3260 svm_set_intercept(svm, INTERCEPT_IRET);
3261 } else {
3262 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3263 svm_clr_intercept(svm, INTERCEPT_IRET);
3264 }
3265 }
3266
3267 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3268 {
3269 struct vcpu_svm *svm = to_svm(vcpu);
3270 struct vmcb *vmcb = svm->vmcb;
3271
3272 if (!gif_set(svm))
3273 return true;
3274
3275 if (is_guest_mode(vcpu)) {
3276 /* As long as interrupts are being delivered... */
3277 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3278 ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
3279 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3280 return true;
3281
3282 /* ... vmexits aren't blocked by the interrupt shadow */
3283 if (nested_exit_on_intr(svm))
3284 return false;
3285 } else {
3286 if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3287 return true;
3288 }
3289
3290 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3291 }
3292
3293 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3294 {
3295 struct vcpu_svm *svm = to_svm(vcpu);
3296 if (svm->nested.nested_run_pending)
3297 return -EBUSY;
3298
3299 /*
3300 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3301 * e.g. if the IRQ arrived asynchronously after checking nested events.
3302 */
3303 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3304 return -EBUSY;
3305
3306 return !svm_interrupt_blocked(vcpu);
3307 }
3308
3309 static void enable_irq_window(struct kvm_vcpu *vcpu)
3310 {
3311 struct vcpu_svm *svm = to_svm(vcpu);
3312
3313 /*
3314 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3315 * 1, because that's a separate STGI/VMRUN intercept. The next time we
3316 * get that intercept, this function will be called again though and
3317 * we'll get the vintr intercept. However, if the vGIF feature is
3318 * enabled, the STGI interception will not occur. Enable the irq
3319 * window under the assumption that the hardware will set the GIF.
3320 */
3321 if (vgif_enabled(svm) || gif_set(svm)) {
3322 /*
3323 * IRQ window is not needed when AVIC is enabled,
3324 * unless we have pending ExtINT since it cannot be injected
3325 * via AVIC. In such case, we need to temporarily disable AVIC,
3326 * and fallback to injecting IRQ via V_IRQ.
3327 */
3328 svm_toggle_avic_for_irq_window(vcpu, false);
3329 svm_set_vintr(svm);
3330 }
3331 }
3332
3333 static void enable_nmi_window(struct kvm_vcpu *vcpu)
3334 {
3335 struct vcpu_svm *svm = to_svm(vcpu);
3336
3337 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3338 == HF_NMI_MASK)
3339 return; /* IRET will cause a vm exit */
3340
3341 if (!gif_set(svm)) {
3342 if (vgif_enabled(svm))
3343 svm_set_intercept(svm, INTERCEPT_STGI);
3344 return; /* STGI will cause a vm exit */
3345 }
3346
3347 /*
3348 * Something prevents NMI from being injected. Single step over possible
3349 * problem (IRET or exception injection or interrupt shadow)
3350 */
3351 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3352 svm->nmi_singlestep = true;
3353 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3354 }
3355
3356 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3357 {
3358 return 0;
3359 }
3360
3361 static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
3362 {
3363 return 0;
3364 }
3365
3366 void svm_flush_tlb(struct kvm_vcpu *vcpu)
3367 {
3368 struct vcpu_svm *svm = to_svm(vcpu);
3369
3370 /*
3371 * Flush only the current ASID even if the TLB flush was invoked via
3372 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
3373 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3374 * unconditionally does a TLB flush on both nested VM-Enter and nested
3375 * VM-Exit (via kvm_mmu_reset_context()).
3376 */
3377 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3378 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3379 else
3380 svm->asid_generation--;
3381 }
3382
3383 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3384 {
3385 struct vcpu_svm *svm = to_svm(vcpu);
3386
3387 invlpga(gva, svm->vmcb->control.asid);
3388 }
3389
3390 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
3391 {
3392 }
3393
3394 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3395 {
3396 struct vcpu_svm *svm = to_svm(vcpu);
3397
3398 if (nested_svm_virtualize_tpr(vcpu))
3399 return;
3400
3401 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3402 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3403 kvm_set_cr8(vcpu, cr8);
3404 }
3405 }
3406
3407 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3408 {
3409 struct vcpu_svm *svm = to_svm(vcpu);
3410 u64 cr8;
3411
3412 if (nested_svm_virtualize_tpr(vcpu) ||
3413 kvm_vcpu_apicv_active(vcpu))
3414 return;
3415
3416 cr8 = kvm_get_cr8(vcpu);
3417 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3418 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3419 }
3420
3421 static void svm_complete_interrupts(struct vcpu_svm *svm)
3422 {
3423 u8 vector;
3424 int type;
3425 u32 exitintinfo = svm->vmcb->control.exit_int_info;
3426 unsigned int3_injected = svm->int3_injected;
3427
3428 svm->int3_injected = 0;
3429
3430 /*
3431 * If we've made progress since setting HF_IRET_MASK, we've
3432 * executed an IRET and can allow NMI injection.
3433 */
3434 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3435 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3436 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3437 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3438 }
3439
3440 svm->vcpu.arch.nmi_injected = false;
3441 kvm_clear_exception_queue(&svm->vcpu);
3442 kvm_clear_interrupt_queue(&svm->vcpu);
3443
3444 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3445 return;
3446
3447 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3448
3449 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3450 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3451
3452 switch (type) {
3453 case SVM_EXITINTINFO_TYPE_NMI:
3454 svm->vcpu.arch.nmi_injected = true;
3455 break;
3456 case SVM_EXITINTINFO_TYPE_EXEPT:
3457 /*
3458 * In case of software exceptions, do not reinject the vector,
3459 * but re-execute the instruction instead. Rewind RIP first
3460 * if we emulated INT3 before.
3461 */
3462 if (kvm_exception_is_soft(vector)) {
3463 if (vector == BP_VECTOR && int3_injected &&
3464 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
3465 kvm_rip_write(&svm->vcpu,
3466 kvm_rip_read(&svm->vcpu) -
3467 int3_injected);
3468 break;
3469 }
3470 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3471 u32 err = svm->vmcb->control.exit_int_info_err;
3472 kvm_requeue_exception_e(&svm->vcpu, vector, err);
3473
3474 } else
3475 kvm_requeue_exception(&svm->vcpu, vector);
3476 break;
3477 case SVM_EXITINTINFO_TYPE_INTR:
3478 kvm_queue_interrupt(&svm->vcpu, vector, false);
3479 break;
3480 default:
3481 break;
3482 }
3483 }
3484
3485 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3486 {
3487 struct vcpu_svm *svm = to_svm(vcpu);
3488 struct vmcb_control_area *control = &svm->vmcb->control;
3489
3490 control->exit_int_info = control->event_inj;
3491 control->exit_int_info_err = control->event_inj_err;
3492 control->event_inj = 0;
3493 svm_complete_interrupts(svm);
3494 }
3495
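/*
 * Only MSR-write exits (SVM_EXIT_MSR with EXITINFO1 == 1) are candidates for
 * the irqoff fast path; handle_fastpath_set_msr_irqoff() decides whether the
 * particular MSR can be completed without a full exit.
 */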
3496 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3497 {
3498 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
3499 to_svm(vcpu)->vmcb->control.exit_info_1)
3500 return handle_fastpath_set_msr_irqoff(vcpu);
3501
3502 return EXIT_FASTPATH_NONE;
3503 }
3504
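/*
 * The low-level VMRUN trampoline is implemented in assembly (vmenter.S): it
 * loads the guest GPRs from the array passed in, executes VMRUN on the given
 * VMCB and stores the guest GPRs back on #VMEXIT.
 */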
3505 void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
3506
3507 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
3508 struct vcpu_svm *svm)
3509 {
3510 /*
3511 * VMENTER enables interrupts (host state), but the kernel state is
3512 * interrupts disabled when this is invoked. Also tell RCU about
3513 * it. This is the same logic as for exit_to_user_mode().
3514 *
3515 * This ensures that e.g. latency analysis on the host observes
3516 * guest mode as interrupt enabled.
3517 *
3518 * guest_enter_irqoff() informs context tracking about the
3519 * transition to guest mode and if enabled adjusts RCU state
3520 * accordingly.
3521 */
3522 instrumentation_begin();
3523 trace_hardirqs_on_prepare();
3524 lockdep_hardirqs_on_prepare(CALLER_ADDR0);
3525 instrumentation_end();
3526
3527 guest_enter_irqoff();
3528 lockdep_hardirqs_on(CALLER_ADDR0);
3529
3530 __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
3531
3532 #ifdef CONFIG_X86_64
3533 native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3534 #else
3535 loadsegment(fs, svm->host.fs);
3536 #ifndef CONFIG_X86_32_LAZY_GS
3537 loadsegment(gs, svm->host.gs);
3538 #endif
3539 #endif
3540
3541 /*
3542 * VMEXIT disables interrupts (host state), but tracing and lockdep
3543 * have them in state 'on' as recorded before entering guest mode.
3544 * Same as enter_from_user_mode().
3545 *
3546 * context_tracking_guest_exit() restores host context and reinstates
3547 * RCU if enabled and required.
3548 *
3549 * This needs to be done before the code below, because native_read_msr()
3550 * contains a tracepoint and x86_spec_ctrl_restore_host() calls back into
3551 * instrumentable code.
3552 */
3553 lockdep_hardirqs_off(CALLER_ADDR0);
3554 context_tracking_guest_exit();
3555
3556 instrumentation_begin();
3557 trace_hardirqs_off_finish();
3558 instrumentation_end();
3559 }
3560
3561 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
3562 {
3563 struct vcpu_svm *svm = to_svm(vcpu);
3564
3565 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3566 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3567 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3568
3569 /*
3570 * Disable singlestep if we're injecting an interrupt/exception.
3571 * We don't want our modified rflags to be pushed on the stack where
3572 * we might not be able to easily reset them if we disabled NMI
3573 * singlestep later.
3574 */
3575 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3576 /*
3577 * Event injection happens before external interrupts cause a
3578 * vmexit and interrupts are disabled here, so smp_send_reschedule
3579 * is enough to force an immediate vmexit.
3580 */
3581 disable_nmi_singlestep(svm);
3582 smp_send_reschedule(vcpu->cpu);
3583 }
3584
3585 pre_svm_run(svm);
3586
3587 sync_lapic_to_cr8(vcpu);
3588
3589 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3590
3591 /*
3592 * Run with all-zero DR6 unless needed, so that we can get the exact cause
3593 * of a #DB.
3594 */
3595 if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
3596 svm_set_dr6(svm, vcpu->arch.dr6);
3597 else
3598 svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);
3599
3600 clgi();
3601 kvm_load_guest_xsave_state(vcpu);
3602
3603 kvm_wait_lapic_expire(vcpu);
3604
3605 /*
3606 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3607 * it's non-zero. Since vmentry is serialising on affected CPUs, there
3608 * is no need to worry about the conditional branch over the wrmsr
3609 * being speculatively taken.
3610 */
3611 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
3612
3613 svm_vcpu_enter_exit(vcpu, svm);
3614
3615 /*
3616 * We do not use IBRS in the kernel. If this vCPU has used the
3617 * SPEC_CTRL MSR it may have left it on; save the value and
3618 * turn it off. This is much more efficient than blindly adding
3619 * it to the atomic save/restore list. Especially as the former
3620 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
3621 *
3622 * For non-nested case:
3623 * If the L01 MSR bitmap does not intercept the MSR, then we need to
3624 * save it.
3625 *
3626 * For nested case:
3627 * If the L02 MSR bitmap does not intercept the MSR, then we need to
3628 * save it.
3629 */
3630 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
3631 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
3632
3633 reload_tss(vcpu);
3634
3635 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
3636
3637 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3638 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3639 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3640 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3641
3642 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3643 kvm_before_interrupt(&svm->vcpu);
3644
3645 kvm_load_host_xsave_state(vcpu);
3646 stgi();
3647
3648 /* Any pending NMI will happen here */
3649
3650 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3651 kvm_after_interrupt(&svm->vcpu);
3652
3653 sync_cr8_to_lapic(vcpu);
3654
3655 svm->next_rip = 0;
3656 if (is_guest_mode(&svm->vcpu)) {
3657 sync_nested_vmcb_control(svm);
3658 svm->nested.nested_run_pending = 0;
3659 }
3660
3661 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3662 vmcb_mark_all_clean(svm->vmcb);
3663
3664 /* if exit due to PF check for async PF */
3665 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3666 svm->vcpu.arch.apf.host_apf_flags =
3667 kvm_read_and_reset_apf_flags();
3668
3669 if (npt_enabled) {
3670 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3671 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
3672 }
3673
3674 /*
3675 * We need to handle MC intercepts here before the vcpu has a chance to
3676 * change the physical cpu
3677 */
3678 if (unlikely(svm->vmcb->control.exit_code ==
3679 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3680 svm_handle_mce(svm);
3681
3682 svm_complete_interrupts(svm);
3683
3684 if (is_guest_mode(vcpu))
3685 return EXIT_FASTPATH_NONE;
3686
3687 return svm_exit_handlers_fastpath(vcpu);
3688 }
3689
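/*
 * Load a new MMU root. With NPT the root is the nested page table, so
 * it goes into nested_cr3 and save.cr3 is refreshed from the guest's
 * own CR3 when that register is available (L2's CR3 is handled by
 * enter_svm_guest_mode); without NPT the root is the shadow page table
 * and is written to save.cr3 directly. __sme_set() adds the host
 * memory-encryption C-bit to the physical address when SME is active.
 */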
3690 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
3691 int root_level)
3692 {
3693 struct vcpu_svm *svm = to_svm(vcpu);
3694 unsigned long cr3;
3695
3696 cr3 = __sme_set(root);
3697 if (npt_enabled) {
3698 svm->vmcb->control.nested_cr3 = cr3;
3699 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
3700
3701 /* Loading L2's CR3 is handled by enter_svm_guest_mode. */
3702 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3703 return;
3704 cr3 = vcpu->arch.cr3;
3705 }
3706
3707 svm->vmcb->save.cr3 = cr3;
3708 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
3709 }
3710
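/*
 * The BIOS can disable SVM via the SVMDIS bit in the VM_CR MSR; this
 * backs the disabled_by_bios hook so module initialization bails out
 * cleanly instead of trying to enable SVM anyway.
 */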
3711 static int is_disabled(void)
3712 {
3713 u64 vm_cr;
3714
3715 rdmsrl(MSR_VM_CR, vm_cr);
3716 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
3717 return 1;
3718
3719 return 0;
3720 }
3721
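/*
 * 0F 01 D9 is the encoding of VMMCALL, the instruction that traps to
 * the host on SVM; the common x86 code uses this hook to rewrite the
 * guest's hypercall instruction into the vendor-native form.
 */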
3722 static void
3723 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3724 {
3725 /*
3726 * Patch in the VMMCALL instruction:
3727 */
3728 hypercall[0] = 0x0f;
3729 hypercall[1] = 0x01;
3730 hypercall[2] = 0xd9;
3731 }
3732
3733 static int __init svm_check_processor_compat(void)
3734 {
3735 return 0;
3736 }
3737
3738 static bool svm_cpu_has_accelerated_tpr(void)
3739 {
3740 return false;
3741 }
3742
3743 static bool svm_has_emulated_msr(u32 index)
3744 {
3745 switch (index) {
3746 case MSR_IA32_MCG_EXT_CTL:
3747 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3748 return false;
3749 default:
3750 break;
3751 }
3752
3753 return true;
3754 }
3755
3756 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3757 {
3758 return 0;
3759 }
3760
3761 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
3762 {
3763 struct vcpu_svm *svm = to_svm(vcpu);
3764 struct kvm_cpuid_entry2 *best;
3765
3766 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
3767 boot_cpu_has(X86_FEATURE_XSAVE) &&
3768 boot_cpu_has(X86_FEATURE_XSAVES);
3769
3770 /* Update nrips enabled cache */
3771 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
3772 guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
3773
3774 	/* Check again if INVPCID interception is required */
3775 svm_check_invpcid(svm);
3776
3777 	/* For SEV guests, the memory encryption bit is not reserved in CR3. */
3778 if (sev_guest(vcpu->kvm)) {
3779 best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
3780 if (best)
3781 vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f));
3782 }
3783
3784 if (!kvm_vcpu_apicv_active(vcpu))
3785 return;
3786
3787 /*
3788 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
3789 * is exposed to the guest, disable AVIC.
3790 */
3791 if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
3792 kvm_request_apicv_update(vcpu->kvm, false,
3793 APICV_INHIBIT_REASON_X2APIC);
3794
3795 /*
3796 	 * Currently, AVIC does not work with nested virtualization, so
3797 	 * disable AVIC when the SVM CPUID feature is exposed to the L1 guest.
3798 */
3799 if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
3800 kvm_request_apicv_update(vcpu->kvm, false,
3801 APICV_INHIBIT_REASON_NESTED);
3802 }
3803
3804 static bool svm_has_wbinvd_exit(void)
3805 {
3806 return true;
3807 }
3808
3809 #define PRE_EX(exit) { .exit_code = (exit), \
3810 .stage = X86_ICPT_PRE_EXCEPT, }
3811 #define POST_EX(exit) { .exit_code = (exit), \
3812 .stage = X86_ICPT_POST_EXCEPT, }
3813 #define POST_MEM(exit) { .exit_code = (exit), \
3814 .stage = X86_ICPT_POST_MEMACCESS, }
3815
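/*
 * Map the generic x86 emulator intercept IDs onto SVM exit codes, plus
 * the stage (before exception checks, after exception checks, or after
 * the memory access) at which the corresponding SVM intercept check
 * architecturally happens. As an example, the POST_EX() entry for HLT
 * expands to:
 *
 *	[x86_intercept_hlt] = { .exit_code = SVM_EXIT_HLT,
 *				.stage = X86_ICPT_POST_EXCEPT, },
 */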
3816 static const struct __x86_intercept {
3817 u32 exit_code;
3818 enum x86_intercept_stage stage;
3819 } x86_intercept_map[] = {
3820 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
3821 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
3822 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
3823 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
3824 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3825 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
3826 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
3827 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
3828 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
3829 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
3830 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
3831 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
3832 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
3833 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
3834 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
3835 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
3836 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
3837 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
3838 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
3839 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
3840 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
3841 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
3842 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
3843 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
3844 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
3845 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
3846 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
3847 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
3848 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
3849 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
3850 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
3851 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
3852 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
3853 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
3854 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
3855 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
3856 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
3857 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
3858 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
3859 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
3860 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
3861 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
3862 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
3863 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
3864 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
3865 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
3866 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
3867 };
3868
3869 #undef PRE_EX
3870 #undef POST_EX
3871 #undef POST_MEM
3872
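/*
 * Called by the instruction emulator while emulating on behalf of L2:
 * translate the emulator's intercept ID into the SVM exit code L1 may
 * have asked to intercept, fill in the exit information a real CPU
 * would provide (CR/DR number, MSR read vs. write, IOIO details), and
 * let the nested exit logic decide whether the instruction must be
 * reflected to L1.
 */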
3873 static int svm_check_intercept(struct kvm_vcpu *vcpu,
3874 struct x86_instruction_info *info,
3875 enum x86_intercept_stage stage,
3876 struct x86_exception *exception)
3877 {
3878 struct vcpu_svm *svm = to_svm(vcpu);
3879 int vmexit, ret = X86EMUL_CONTINUE;
3880 struct __x86_intercept icpt_info;
3881 struct vmcb *vmcb = svm->vmcb;
3882
3883 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
3884 goto out;
3885
3886 icpt_info = x86_intercept_map[info->intercept];
3887
3888 if (stage != icpt_info.stage)
3889 goto out;
3890
3891 switch (icpt_info.exit_code) {
3892 case SVM_EXIT_READ_CR0:
3893 if (info->intercept == x86_intercept_cr_read)
3894 icpt_info.exit_code += info->modrm_reg;
3895 break;
3896 case SVM_EXIT_WRITE_CR0: {
3897 unsigned long cr0, val;
3898
3899 if (info->intercept == x86_intercept_cr_write)
3900 icpt_info.exit_code += info->modrm_reg;
3901
3902 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
3903 info->intercept == x86_intercept_clts)
3904 break;
3905
3906 if (!(vmcb_is_intercept(&svm->nested.ctl,
3907 INTERCEPT_SELECTIVE_CR0)))
3908 break;
3909
3910 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
3911 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
3912
3913 if (info->intercept == x86_intercept_lmsw) {
3914 cr0 &= 0xfUL;
3915 val &= 0xfUL;
3916 /* lmsw can't clear PE - catch this here */
3917 if (cr0 & X86_CR0_PE)
3918 val |= X86_CR0_PE;
3919 }
3920
3921 if (cr0 ^ val)
3922 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
3923
3924 break;
3925 }
3926 case SVM_EXIT_READ_DR0:
3927 case SVM_EXIT_WRITE_DR0:
3928 icpt_info.exit_code += info->modrm_reg;
3929 break;
3930 case SVM_EXIT_MSR:
3931 if (info->intercept == x86_intercept_wrmsr)
3932 vmcb->control.exit_info_1 = 1;
3933 else
3934 vmcb->control.exit_info_1 = 0;
3935 break;
3936 case SVM_EXIT_PAUSE:
3937 /*
3938 	 * We only get this intercept for NOP, but PAUSE is
3939 	 * REP NOP, so check for the REP prefix here.
3940 */
3941 if (info->rep_prefix != REPE_PREFIX)
3942 goto out;
3943 break;
3944 case SVM_EXIT_IOIO: {
3945 u64 exit_info;
3946 u32 bytes;
3947
3948 if (info->intercept == x86_intercept_in ||
3949 info->intercept == x86_intercept_ins) {
3950 exit_info = ((info->src_val & 0xffff) << 16) |
3951 SVM_IOIO_TYPE_MASK;
3952 bytes = info->dst_bytes;
3953 } else {
3954 exit_info = (info->dst_val & 0xffff) << 16;
3955 bytes = info->src_bytes;
3956 }
3957
3958 if (info->intercept == x86_intercept_outs ||
3959 info->intercept == x86_intercept_ins)
3960 exit_info |= SVM_IOIO_STR_MASK;
3961
3962 if (info->rep_prefix)
3963 exit_info |= SVM_IOIO_REP_MASK;
3964
3965 bytes = min(bytes, 4u);
3966
3967 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
3968
3969 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
3970
3971 vmcb->control.exit_info_1 = exit_info;
3972 vmcb->control.exit_info_2 = info->next_rip;
3973
3974 break;
3975 }
3976 default:
3977 break;
3978 }
3979
3980 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
3981 if (static_cpu_has(X86_FEATURE_NRIPS))
3982 vmcb->control.next_rip = info->next_rip;
3983 vmcb->control.exit_code = icpt_info.exit_code;
3984 vmexit = nested_svm_exit_handled(svm);
3985
3986 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
3987 : X86EMUL_CONTINUE;
3988
3989 out:
3990 return ret;
3991 }
3992
3993 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
3994 {
3995 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
3996 vcpu->arch.at_instruction_boundary = true;
3997 }
3998
3999 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4000 {
4001 if (!kvm_pause_in_guest(vcpu->kvm))
4002 shrink_ple_window(vcpu);
4003 }
4004
4005 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4006 {
4007 /* [63:9] are reserved. */
4008 vcpu->arch.mcg_cap &= 0x1ff;
4009 }
4010
4011 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4012 {
4013 struct vcpu_svm *svm = to_svm(vcpu);
4014
4015 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4016 if (!gif_set(svm))
4017 return true;
4018
4019 return is_smm(vcpu);
4020 }
4021
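/*
 * Return -EBUSY while a nested VMRUN is pending, or when an injected
 * SMI should instead be handled as a VMEXIT to L1; otherwise an SMI is
 * allowed only if it is not blocked by GIF or by already being in SMM.
 */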
4022 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4023 {
4024 struct vcpu_svm *svm = to_svm(vcpu);
4025 if (svm->nested.nested_run_pending)
4026 return -EBUSY;
4027
4028 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
4029 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4030 return -EBUSY;
4031
4032 return !svm_smi_blocked(vcpu);
4033 }
4034
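/*
 * On SMM entry while running a nested guest, record that fact plus the
 * vmcb12 GPA in the SMM state-save area (the FED8h/FEE0h fields noted
 * below) and perform a nested VMEXIT, so that SMM itself executes in
 * the L1 context.
 */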
4035 static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
4036 {
4037 struct vcpu_svm *svm = to_svm(vcpu);
4038 int ret;
4039
4040 if (is_guest_mode(vcpu)) {
4041 /* FED8h - SVM Guest */
4042 put_smstate(u64, smstate, 0x7ed8, 1);
4043 /* FEE0h - SVM Guest VMCB Physical Address */
4044 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
4045
4046 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4047 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4048 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4049
4050 ret = nested_svm_vmexit(svm);
4051 if (ret)
4052 return ret;
4053 }
4054 return 0;
4055 }
4056
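/*
 * The reverse of the above on RSM: if the saved state says a nested
 * guest was running, re-check that nested SVM is still permitted
 * (guest CPUID and the saved EFER.SVME), map vmcb12 again and re-enter
 * guest mode.
 */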
4057 static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
4058 {
4059 struct vcpu_svm *svm = to_svm(vcpu);
4060 struct kvm_host_map map;
4061 int ret = 0;
4062
4063 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
4064 u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4065 u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
4066 u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4067
4068 if (guest) {
4069 if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4070 return 1;
4071
4072 if (!(saved_efer & EFER_SVME))
4073 return 1;
4074
4075 if (kvm_vcpu_map(&svm->vcpu,
4076 gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4077 return 1;
4078
4079 if (svm_allocate_nested(svm))
4080 return 1;
4081
4082 ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva);
4083 kvm_vcpu_unmap(&svm->vcpu, &map, true);
4084 }
4085 }
4086
4087 return ret;
4088 }
4089
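/*
 * With GIF clear, SMIs stay pending; if virtual GIF is available,
 * intercept STGI so we get a VMEXIT as soon as the guest re-enables
 * GIF and the SMI can be injected. Otherwise the vCPU must already be
 * in SMM, and the eventual RSM causes a VMEXIT anyway.
 */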
4090 static void enable_smi_window(struct kvm_vcpu *vcpu)
4091 {
4092 struct vcpu_svm *svm = to_svm(vcpu);
4093
4094 if (!gif_set(svm)) {
4095 if (vgif_enabled(svm))
4096 svm_set_intercept(svm, INTERCEPT_STGI);
4097 /* STGI will cause a vm exit */
4098 } else {
4099 /* We must be in SMM; RSM will cause a vmexit anyway. */
4100 }
4101 }
4102
4103 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
4104 {
4105 bool smep, smap, is_user;
4106 unsigned long cr4;
4107
4108 /*
4109 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4110 *
4111 * Errata:
4112 	 * When the CPU raises #NPF on a guest data access and vCPU CR4.SMAP=1,
4113 	 * it is possible that the CPU microcode implementing DecodeAssist will
4114 	 * fail to read the bytes of the instruction which caused the #NPF. In
4115 	 * this case, the GuestIntrBytes field of the VMCB on a VMEXIT will
4116 	 * incorrectly return 0 instead of the correct guest instruction bytes.
4117 *
4118 	 * This happens because the CPU microcode reading the instruction bytes
4119 	 * uses a special opcode which attempts to read data with CPL=0
4120 	 * privileges. The microcode reads CS:RIP and if it hits an SMAP
4121 	 * fault, it gives up and returns no instruction bytes.
4122 *
4123 * Detection:
4124 	 * We reach here when the CPU supports DecodeAssist, raised #NPF and
4125 	 * returned 0 in the GuestIntrBytes field of the VMCB.
4126 	 * First, the erratum can only be triggered when vCPU CR4.SMAP=1.
4127 	 * Second, if vCPU CR4.SMEP=1, the erratum can only be triggered when
4128 	 * vCPU CPL==3 (because otherwise the guest would have triggered an
4129 	 * SMEP fault instead of #NPF).
4130 	 * Otherwise, with vCPU CR4.SMEP=0, the erratum can be triggered at any
4131 	 * vCPU CPL. As most guests that enable SMAP also enable SMEP, use the
4132 	 * above logic to minimize false positives in detecting the erratum
4133 	 * while still preserving semantic correctness in all cases.
4134 *
4135 * Workaround:
4136 * To determine what instruction the guest was executing, the hypervisor
4137 * will have to decode the instruction at the instruction pointer.
4138 *
4139 	 * For a non-SEV guest, the hypervisor is able to read guest
4140 	 * memory and decode the instruction when insn_len is zero, so
4141 	 * we return true to indicate that decoding is possible.
4142 	 *
4143 	 * But for an SEV guest, guest memory is encrypted with a
4144 	 * guest-specific key and the hypervisor cannot decode the
4145 	 * instruction, so we cannot work around the erratum. Print an
4146 	 * error and request that the guest be killed.
4147 */
4148 if (likely(!insn || insn_len))
4149 return true;
4150
4151 cr4 = kvm_read_cr4(vcpu);
4152 smep = cr4 & X86_CR4_SMEP;
4153 smap = cr4 & X86_CR4_SMAP;
4154 is_user = svm_get_cpl(vcpu) == 3;
4155 if (smap && (!smep || is_user)) {
4156 if (!sev_guest(vcpu->kvm))
4157 return true;
4158
4159 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4160 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4161 }
4162
4163 return false;
4164 }
4165
4166 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4167 {
4168 struct vcpu_svm *svm = to_svm(vcpu);
4169
4170 /*
4171 	 * TODO: The last condition latches INIT signals on the vCPU when
4172 	 * the vCPU is in guest mode and vmcb12 defines an intercept on INIT.
4173 * To properly emulate the INIT intercept,
4174 * svm_check_nested_events() should call nested_svm_vmexit()
4175 * if an INIT signal is pending.
4176 */
4177 return !gif_set(svm) ||
4178 (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
4179 }
4180
4181 static void svm_vm_destroy(struct kvm *kvm)
4182 {
4183 avic_vm_destroy(kvm);
4184 sev_vm_destroy(kvm);
4185 }
4186
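/*
 * Per-VM setup: if the PAUSE filter count or threshold module parameter
 * is zero, let the guest PAUSE without exiting; initialize AVIC state
 * when the avic parameter is enabled, and tell the common code whether
 * APIC virtualization is active for this VM.
 */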
4187 static int svm_vm_init(struct kvm *kvm)
4188 {
4189 if (!pause_filter_count || !pause_filter_thresh)
4190 kvm->arch.pause_in_guest = true;
4191
4192 if (avic) {
4193 int ret = avic_vm_init(kvm);
4194 if (ret)
4195 return ret;
4196 }
4197
4198 kvm_apicv_init(kvm, avic);
4199 return 0;
4200 }
4201
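/*
 * The vendor callback table handed to the common x86 code at runtime;
 * every kvm_x86_ops hook that SVM implements is wired up here, from
 * VMCB and vCPU management down to the SEV memory-encryption ioctls.
 */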
4202 static struct kvm_x86_ops svm_x86_ops __initdata = {
4203 .hardware_unsetup = svm_hardware_teardown,
4204 .hardware_enable = svm_hardware_enable,
4205 .hardware_disable = svm_hardware_disable,
4206 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4207 .has_emulated_msr = svm_has_emulated_msr,
4208
4209 .vcpu_create = svm_create_vcpu,
4210 .vcpu_free = svm_free_vcpu,
4211 .vcpu_reset = svm_vcpu_reset,
4212
4213 .vm_size = sizeof(struct kvm_svm),
4214 .vm_init = svm_vm_init,
4215 .vm_destroy = svm_vm_destroy,
4216
4217 .prepare_guest_switch = svm_prepare_guest_switch,
4218 .vcpu_load = svm_vcpu_load,
4219 .vcpu_put = svm_vcpu_put,
4220 .vcpu_blocking = svm_vcpu_blocking,
4221 .vcpu_unblocking = svm_vcpu_unblocking,
4222
4223 .update_exception_bitmap = update_exception_bitmap,
4224 .get_msr_feature = svm_get_msr_feature,
4225 .get_msr = svm_get_msr,
4226 .set_msr = svm_set_msr,
4227 .get_segment_base = svm_get_segment_base,
4228 .get_segment = svm_get_segment,
4229 .set_segment = svm_set_segment,
4230 .get_cpl = svm_get_cpl,
4231 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4232 .set_cr0 = svm_set_cr0,
4233 .set_cr4 = svm_set_cr4,
4234 .set_efer = svm_set_efer,
4235 .get_idt = svm_get_idt,
4236 .set_idt = svm_set_idt,
4237 .get_gdt = svm_get_gdt,
4238 .set_gdt = svm_set_gdt,
4239 .set_dr7 = svm_set_dr7,
4240 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4241 .cache_reg = svm_cache_reg,
4242 .get_rflags = svm_get_rflags,
4243 .set_rflags = svm_set_rflags,
4244
4245 .tlb_flush_all = svm_flush_tlb,
4246 .tlb_flush_current = svm_flush_tlb,
4247 .tlb_flush_gva = svm_flush_tlb_gva,
4248 .tlb_flush_guest = svm_flush_tlb,
4249
4250 .run = svm_vcpu_run,
4251 .handle_exit = handle_exit,
4252 .skip_emulated_instruction = skip_emulated_instruction,
4253 .update_emulated_instruction = NULL,
4254 .set_interrupt_shadow = svm_set_interrupt_shadow,
4255 .get_interrupt_shadow = svm_get_interrupt_shadow,
4256 .patch_hypercall = svm_patch_hypercall,
4257 .set_irq = svm_set_irq,
4258 .set_nmi = svm_inject_nmi,
4259 .queue_exception = svm_queue_exception,
4260 .cancel_injection = svm_cancel_injection,
4261 .interrupt_allowed = svm_interrupt_allowed,
4262 .nmi_allowed = svm_nmi_allowed,
4263 .get_nmi_mask = svm_get_nmi_mask,
4264 .set_nmi_mask = svm_set_nmi_mask,
4265 .enable_nmi_window = enable_nmi_window,
4266 .enable_irq_window = enable_irq_window,
4267 .update_cr8_intercept = update_cr8_intercept,
4268 .set_virtual_apic_mode = svm_set_virtual_apic_mode,
4269 .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
4270 .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
4271 .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
4272 .load_eoi_exitmap = svm_load_eoi_exitmap,
4273 .hwapic_irr_update = svm_hwapic_irr_update,
4274 .hwapic_isr_update = svm_hwapic_isr_update,
4275 .sync_pir_to_irr = kvm_lapic_find_highest_irr,
4276 .apicv_post_state_restore = avic_post_state_restore,
4277
4278 .set_tss_addr = svm_set_tss_addr,
4279 .set_identity_map_addr = svm_set_identity_map_addr,
4280 .get_mt_mask = svm_get_mt_mask,
4281
4282 .get_exit_info = svm_get_exit_info,
4283
4284 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4285
4286 .has_wbinvd_exit = svm_has_wbinvd_exit,
4287
4288 .write_l1_tsc_offset = svm_write_l1_tsc_offset,
4289
4290 .load_mmu_pgd = svm_load_mmu_pgd,
4291
4292 .check_intercept = svm_check_intercept,
4293 .handle_exit_irqoff = svm_handle_exit_irqoff,
4294
4295 .request_immediate_exit = __kvm_request_immediate_exit,
4296
4297 .sched_in = svm_sched_in,
4298
4299 .pmu_ops = &amd_pmu_ops,
4300 .nested_ops = &svm_nested_ops,
4301
4302 .deliver_posted_interrupt = svm_deliver_avic_intr,
4303 .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
4304 .update_pi_irte = svm_update_pi_irte,
4305 .setup_mce = svm_setup_mce,
4306
4307 .smi_allowed = svm_smi_allowed,
4308 .pre_enter_smm = svm_pre_enter_smm,
4309 .pre_leave_smm = svm_pre_leave_smm,
4310 .enable_smi_window = enable_smi_window,
4311
4312 .mem_enc_op = svm_mem_enc_op,
4313 .mem_enc_reg_region = svm_register_enc_region,
4314 .mem_enc_unreg_region = svm_unregister_enc_region,
4315
4316 .can_emulate_instruction = svm_can_emulate_instruction,
4317
4318 .apic_init_signal_blocked = svm_apic_init_signal_blocked,
4319
4320 .msr_filter_changed = svm_msr_filter_changed,
4321 };
4322
4323 static struct kvm_x86_init_ops svm_init_ops __initdata = {
4324 .cpu_has_kvm_support = has_svm,
4325 .disabled_by_bios = is_disabled,
4326 .hardware_setup = svm_hardware_setup,
4327 .check_processor_compatibility = svm_check_processor_compat,
4328
4329 .runtime_ops = &svm_x86_ops,
4330 };
4331
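/*
 * Module entry point: kvm_init() verifies SVM support and BIOS
 * enablement via svm_init_ops, runs the SVM hardware setup, and
 * registers /dev/kvm, using vcpu_svm as the per-vCPU allocation size
 * and alignment.
 */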
4332 static int __init svm_init(void)
4333 {
4334 __unused_size_checks();
4335
4336 return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
4337 __alignof__(struct vcpu_svm), THIS_MODULE);
4338 }
4339
4340 static void __exit svm_exit(void)
4341 {
4342 kvm_exit();
4343 }
4344
4345 module_init(svm_init)
4346 module_exit(svm_exit)
4347