1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5 */
6
7 #include <linux/bug.h>
8 #include <linux/cpu_pm.h>
9 #include <linux/errno.h>
10 #include <linux/err.h>
11 #include <linux/kvm_host.h>
12 #include <linux/list.h>
13 #include <linux/module.h>
14 #include <linux/vmalloc.h>
15 #include <linux/fs.h>
16 #include <linux/mman.h>
17 #include <linux/sched.h>
18 #include <linux/kvm.h>
19 #include <linux/kvm_irqfd.h>
20 #include <linux/irqbypass.h>
21 #include <linux/sched/stat.h>
22 #include <trace/events/kvm.h>
23
24 #define CREATE_TRACE_POINTS
25 #include "trace_arm.h"
26
27 #include <linux/uaccess.h>
28 #include <asm/ptrace.h>
29 #include <asm/mman.h>
30 #include <asm/tlbflush.h>
31 #include <asm/cacheflush.h>
32 #include <asm/cpufeature.h>
33 #include <asm/virt.h>
34 #include <asm/kvm_arm.h>
35 #include <asm/kvm_asm.h>
36 #include <asm/kvm_mmu.h>
37 #include <asm/kvm_emulate.h>
38 #include <asm/kvm_coproc.h>
39 #include <asm/sections.h>
40
41 #include <kvm/arm_hypercalls.h>
42 #include <kvm/arm_pmu.h>
43 #include <kvm/arm_psci.h>
44
45 #ifdef REQUIRES_VIRT
46 __asm__(".arch_extension virt");
47 #endif
48
49 DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
50
51 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
52 unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
53
54 /* The VMID used in the VTTBR */
55 static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
56 static u32 kvm_next_vmid;
57 static DEFINE_SPINLOCK(kvm_vmid_lock);
58
59 static bool vgic_present;
60
61 static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
62 DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
63
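/*
 * Only kick (IPI) the vcpu if it is currently executing guest code:
 * kvm_vcpu_exiting_guest_mode() atomically moves IN_GUEST_MODE to
 * EXITING_GUEST_MODE, so at most one kick is sent per guest entry.
 */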
64 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
65 {
66 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
67 }
68
69 int kvm_arch_hardware_setup(void *opaque)
70 {
71 return 0;
72 }
73
74 int kvm_arch_check_processor_compat(void *opaque)
75 {
76 return 0;
77 }
78
79 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
80 struct kvm_enable_cap *cap)
81 {
82 int r;
83
84 if (cap->flags)
85 return -EINVAL;
86
87 switch (cap->cap) {
88 case KVM_CAP_ARM_NISV_TO_USER:
89 r = 0;
90 kvm->arch.return_nisv_io_abort_to_user = true;
91 break;
92 default:
93 r = -EINVAL;
94 break;
95 }
96
97 return r;
98 }
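/*
 * Illustrative userspace sketch (not part of this file): the NISV abort
 * reporting handled above is a VM-wide capability, enabled roughly like so:
 *
 *	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_NISV_TO_USER };
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */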
99
100 static int kvm_arm_default_max_vcpus(void)
101 {
102 return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
103 }
104
105 static void set_default_csv2(struct kvm *kvm)
106 {
107 /*
108 * The default is to expose CSV2 == 1 if the HW isn't affected.
109 * Although this is a per-CPU feature, we make it global because
110 * asymmetric systems are just a nuisance.
111 *
112 * Userspace can override this as long as it doesn't promise
113 * the impossible.
114 */
115 if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
116 kvm->arch.pfr0_csv2 = 1;
117 }
118
119 /**
120 * kvm_arch_init_vm - initializes a VM data structure
121 * @kvm: pointer to the KVM struct
122 */
123 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
124 {
125 int ret;
126
127 ret = kvm_arm_setup_stage2(kvm, type);
128 if (ret)
129 return ret;
130
131 ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu);
132 if (ret)
133 return ret;
134
135 ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
136 if (ret)
137 goto out_free_stage2_pgd;
138
139 kvm_vgic_early_init(kvm);
140
141 /* The maximum number of VCPUs is limited by the host's GIC model */
142 kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
143
144 set_default_csv2(kvm);
145
146 return ret;
147 out_free_stage2_pgd:
148 kvm_free_stage2_pgd(&kvm->arch.mmu);
149 return ret;
150 }
151
152 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
153 {
154 return VM_FAULT_SIGBUS;
155 }
156
157
158 /**
159 * kvm_arch_destroy_vm - destroy the VM data structure
160 * @kvm: pointer to the KVM struct
161 */
162 void kvm_arch_destroy_vm(struct kvm *kvm)
163 {
164 int i;
165
166 bitmap_free(kvm->arch.pmu_filter);
167
168 kvm_vgic_destroy(kvm);
169
170 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
171 if (kvm->vcpus[i]) {
172 kvm_vcpu_destroy(kvm->vcpus[i]);
173 kvm->vcpus[i] = NULL;
174 }
175 }
176 atomic_set(&kvm->online_vcpus, 0);
177 }
178
179 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
180 {
181 int r;
182 switch (ext) {
183 case KVM_CAP_IRQCHIP:
184 r = vgic_present;
185 break;
186 case KVM_CAP_IOEVENTFD:
187 case KVM_CAP_DEVICE_CTRL:
188 case KVM_CAP_USER_MEMORY:
189 case KVM_CAP_SYNC_MMU:
190 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
191 case KVM_CAP_ONE_REG:
192 case KVM_CAP_ARM_PSCI:
193 case KVM_CAP_ARM_PSCI_0_2:
194 case KVM_CAP_READONLY_MEM:
195 case KVM_CAP_MP_STATE:
196 case KVM_CAP_IMMEDIATE_EXIT:
197 case KVM_CAP_VCPU_EVENTS:
198 case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
199 case KVM_CAP_ARM_NISV_TO_USER:
200 case KVM_CAP_ARM_INJECT_EXT_DABT:
201 r = 1;
202 break;
203 case KVM_CAP_ARM_SET_DEVICE_ADDR:
204 r = 1;
205 break;
206 case KVM_CAP_NR_VCPUS:
207 r = num_online_cpus();
208 break;
209 case KVM_CAP_MAX_VCPUS:
210 case KVM_CAP_MAX_VCPU_ID:
211 if (kvm)
212 r = kvm->arch.max_vcpus;
213 else
214 r = kvm_arm_default_max_vcpus();
215 break;
216 case KVM_CAP_MSI_DEVID:
217 if (!kvm)
218 r = -EINVAL;
219 else
220 r = kvm->arch.vgic.msis_require_devid;
221 break;
222 case KVM_CAP_ARM_USER_IRQ:
223 /*
224 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
225 * (bump this number if adding more devices)
226 */
227 r = 1;
228 break;
229 case KVM_CAP_STEAL_TIME:
230 r = kvm_arm_pvtime_supported();
231 break;
232 default:
233 r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
234 break;
235 }
236 return r;
237 }
238
239 long kvm_arch_dev_ioctl(struct file *filp,
240 unsigned int ioctl, unsigned long arg)
241 {
242 return -EINVAL;
243 }
244
245 struct kvm *kvm_arch_alloc_vm(void)
246 {
247 if (!has_vhe())
248 return kzalloc(sizeof(struct kvm), GFP_KERNEL);
249
250 return vzalloc(sizeof(struct kvm));
251 }
252
253 void kvm_arch_free_vm(struct kvm *kvm)
254 {
255 if (!has_vhe())
256 kfree(kvm);
257 else
258 vfree(kvm);
259 }
260
261 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
262 {
263 if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
264 return -EBUSY;
265
266 if (id >= kvm->arch.max_vcpus)
267 return -EINVAL;
268
269 return 0;
270 }
271
272 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
273 {
274 int err;
275
276 /* Force users to call KVM_ARM_VCPU_INIT */
277 vcpu->arch.target = -1;
278 bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
279
280 vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
281
282 /* Set up the timer */
283 kvm_timer_vcpu_init(vcpu);
284
285 kvm_pmu_vcpu_init(vcpu);
286
287 kvm_arm_reset_debug_ptr(vcpu);
288
289 kvm_arm_pvtime_vcpu_init(&vcpu->arch);
290
291 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
292
293 err = kvm_vgic_vcpu_init(vcpu);
294 if (err)
295 return err;
296
297 return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
298 }
299
300 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
301 {
302 }
303
304 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
305 {
306 if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
307 static_branch_dec(&userspace_irqchip_in_use);
308
309 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
310 kvm_timer_vcpu_terminate(vcpu);
311 kvm_pmu_vcpu_destroy(vcpu);
312
313 kvm_arm_vcpu_destroy(vcpu);
314 }
315
316 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
317 {
318 return kvm_timer_is_pending(vcpu);
319 }
320
321 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
322 {
323 /*
324 * If we're about to block (most likely because we've just hit a
325 * WFI), we need to sync back the state of the GIC CPU interface
326 * so that we have the latest PMR and group enables. This ensures
327 * that kvm_arch_vcpu_runnable has up-to-date data to decide
328 * whether we have pending interrupts.
329 *
330 * For the same reason, we want to tell GICv4 that we need
331 * doorbells to be signalled, should an interrupt become pending.
332 */
333 preempt_disable();
334 kvm_vgic_vmcr_sync(vcpu);
335 vgic_v4_put(vcpu, true);
336 preempt_enable();
337 }
338
339 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
340 {
341 preempt_disable();
342 vgic_v4_load(vcpu);
343 preempt_enable();
344 }
345
346 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
347 {
348 struct kvm_s2_mmu *mmu;
349 int *last_ran;
350
351 mmu = vcpu->arch.hw_mmu;
352 last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
353
354 /*
355 * We guarantee that both TLBs and I-cache are private to each
356 * vcpu. If we detect that a vcpu from the same VM has
357 * previously run on the same physical CPU, call into the
358 * hypervisor code to nuke the relevant contexts.
359 *
360 * We might get preempted before the vCPU actually runs, but
361 * over-invalidation doesn't affect correctness.
362 */
363 if (*last_ran != vcpu->vcpu_id) {
364 kvm_call_hyp(__kvm_flush_cpu_context, mmu);
365 *last_ran = vcpu->vcpu_id;
366 }
367
368 vcpu->cpu = cpu;
369
370 kvm_vgic_load(vcpu);
371 kvm_timer_vcpu_load(vcpu);
372 if (has_vhe())
373 kvm_vcpu_load_sysregs_vhe(vcpu);
374 kvm_arch_vcpu_load_fp(vcpu);
375 kvm_vcpu_pmu_restore_guest(vcpu);
376 if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
377 kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
378
379 if (single_task_running())
380 vcpu_clear_wfx_traps(vcpu);
381 else
382 vcpu_set_wfx_traps(vcpu);
383
384 if (vcpu_has_ptrauth(vcpu))
385 vcpu_ptrauth_disable(vcpu);
386 }
387
388 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
389 {
390 kvm_arch_vcpu_put_fp(vcpu);
391 if (has_vhe())
392 kvm_vcpu_put_sysregs_vhe(vcpu);
393 kvm_timer_vcpu_put(vcpu);
394 kvm_vgic_put(vcpu);
395 kvm_vcpu_pmu_restore_host(vcpu);
396
397 vcpu->cpu = -1;
398 }
399
400 static void vcpu_power_off(struct kvm_vcpu *vcpu)
401 {
402 vcpu->arch.power_off = true;
403 kvm_make_request(KVM_REQ_SLEEP, vcpu);
404 kvm_vcpu_kick(vcpu);
405 }
406
407 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
408 struct kvm_mp_state *mp_state)
409 {
410 if (vcpu->arch.power_off)
411 mp_state->mp_state = KVM_MP_STATE_STOPPED;
412 else
413 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
414
415 return 0;
416 }
417
418 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
419 struct kvm_mp_state *mp_state)
420 {
421 int ret = 0;
422
423 switch (mp_state->mp_state) {
424 case KVM_MP_STATE_RUNNABLE:
425 vcpu->arch.power_off = false;
426 break;
427 case KVM_MP_STATE_STOPPED:
428 vcpu_power_off(vcpu);
429 break;
430 default:
431 ret = -EINVAL;
432 }
433
434 return ret;
435 }
436
437 /**
438 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
439 * @v: The VCPU pointer
440 *
441 * If the guest CPU is not waiting for interrupts or an interrupt line is
442 * asserted, the CPU is by definition runnable.
443 */
444 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
445 {
446 bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
447 return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
448 && !v->arch.power_off && !v->arch.pause);
449 }
450
451 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
452 {
453 return vcpu_mode_priv(vcpu);
454 }
455
456 /* Just ensure a guest exit from a particular CPU */
457 static void exit_vm_noop(void *info)
458 {
459 }
460
461 void force_vm_exit(const cpumask_t *mask)
462 {
463 preempt_disable();
464 smp_call_function_many(mask, exit_vm_noop, NULL, true);
465 preempt_enable();
466 }
467
468 /**
469 * need_new_vmid_gen - check that the VMID is still valid
470 * @vmid: The VMID to check
471 *
472 * return true if there is a new generation of VMIDs being used
473 *
474 * The hardware supports a limited set of values with the value zero reserved
475 * for the host, so we check if an assigned value belongs to a previous
476 * generation, which requires us to assign a new value. If we're the first to
477 * use a VMID for the new generation, we must flush necessary caches and TLBs
478 * on all CPUs.
479 */
480 static bool need_new_vmid_gen(struct kvm_vmid *vmid)
481 {
482 u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
483 smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
484 return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
485 }
486
487 /**
488 * update_vmid - Update the vmid with a valid VMID for the current generation
489 * @vmid: The stage-2 VMID information struct
490 */
491 static void update_vmid(struct kvm_vmid *vmid)
492 {
493 if (!need_new_vmid_gen(vmid))
494 return;
495
496 spin_lock(&kvm_vmid_lock);
497
498 /*
499 * Re-check vmid_gen under the lock: if another vcpu already allocated
500 * a valid vmid for this vm while we were waiting, this vcpu must reuse
501 * it instead of allocating a new one.
502 */
503 if (!need_new_vmid_gen(vmid)) {
504 spin_unlock(&kvm_vmid_lock);
505 return;
506 }
507
508 /* First user of a new VMID generation? */
509 if (unlikely(kvm_next_vmid == 0)) {
510 atomic64_inc(&kvm_vmid_gen);
511 kvm_next_vmid = 1;
512
513 /*
514 * On SMP we know no other CPUs can use this CPU's or each
515 * other's VMID after force_vm_exit returns since the
516 * kvm_vmid_lock blocks them from reentry to the guest.
517 */
518 force_vm_exit(cpu_all_mask);
519 /*
520 * Now broadcast TLB + ICACHE invalidation over the inner
521 * shareable domain to make sure all data structures are
522 * clean.
523 */
524 kvm_call_hyp(__kvm_flush_vm_context);
525 }
526
527 vmid->vmid = kvm_next_vmid;
528 kvm_next_vmid++;
529 kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
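/*
 * The mask above keeps kvm_next_vmid within the VMID width supported by
 * the hardware (e.g. with 8-bit VMIDs it cycles through 1..255); once it
 * wraps back to 0, the next caller bumps kvm_vmid_gen and flushes all
 * guest contexts before VMIDs are reused.
 */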
530
531 smp_wmb();
532 WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
533
534 spin_unlock(&kvm_vmid_lock);
535 }
536
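/*
 * One-time setup performed on a vcpu's first KVM_RUN: map VGIC resources
 * if the irqchip is in-kernel (or flag the userspace-irqchip case), then
 * enable the timer and PMU for this vcpu.
 */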
537 static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
538 {
539 struct kvm *kvm = vcpu->kvm;
540 int ret = 0;
541
542 if (likely(vcpu->arch.has_run_once))
543 return 0;
544
545 if (!kvm_arm_vcpu_is_finalized(vcpu))
546 return -EPERM;
547
548 vcpu->arch.has_run_once = true;
549
550 kvm_arm_vcpu_init_debug(vcpu);
551
552 if (likely(irqchip_in_kernel(kvm))) {
553 /*
554 * Map the VGIC hardware resources before running a vcpu the
555 * first time on this VM.
556 */
557 if (unlikely(!vgic_ready(kvm))) {
558 ret = kvm_vgic_map_resources(kvm);
559 if (ret)
560 return ret;
561 }
562 } else {
563 /*
564 * Tell the rest of the code that there are userspace irqchip
565 * VMs in the wild.
566 */
567 static_branch_inc(&userspace_irqchip_in_use);
568 }
569
570 ret = kvm_timer_enable(vcpu);
571 if (ret)
572 return ret;
573
574 ret = kvm_arm_pmu_v3_enable(vcpu);
575
576 return ret;
577 }
578
579 bool kvm_arch_intc_initialized(struct kvm *kvm)
580 {
581 return vgic_initialized(kvm);
582 }
583
584 void kvm_arm_halt_guest(struct kvm *kvm)
585 {
586 int i;
587 struct kvm_vcpu *vcpu;
588
589 kvm_for_each_vcpu(i, vcpu, kvm)
590 vcpu->arch.pause = true;
591 kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
592 }
593
594 void kvm_arm_resume_guest(struct kvm *kvm)
595 {
596 int i;
597 struct kvm_vcpu *vcpu;
598
599 kvm_for_each_vcpu(i, vcpu, kvm) {
600 vcpu->arch.pause = false;
601 rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu));
602 }
603 }
604
605 static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
606 {
607 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
608
609 rcuwait_wait_event(wait,
610 (!vcpu->arch.power_off) && (!vcpu->arch.pause),
611 TASK_INTERRUPTIBLE);
612
613 if (vcpu->arch.power_off || vcpu->arch.pause) {
614 /* Awaken to handle a signal, request we sleep again later. */
615 kvm_make_request(KVM_REQ_SLEEP, vcpu);
616 }
617
618 /*
619 * Make sure we will observe a potential reset request if we've
620 * observed a change to the power state. Pairs with the smp_wmb() in
621 * kvm_psci_vcpu_on().
622 */
623 smp_rmb();
624 }
625
626 static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
627 {
628 return vcpu->arch.target >= 0;
629 }
630
631 static void check_vcpu_requests(struct kvm_vcpu *vcpu)
632 {
633 if (kvm_request_pending(vcpu)) {
634 if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
635 vcpu_req_sleep(vcpu);
636
637 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
638 kvm_reset_vcpu(vcpu);
639
640 /*
641 * Clear IRQ_PENDING requests that were made to guarantee
642 * that a VCPU sees new virtual interrupts.
643 */
644 kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
645
646 if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
647 kvm_update_stolen_time(vcpu);
648
649 if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
650 /* The distributor enable bits were changed */
651 preempt_disable();
652 vgic_v4_put(vcpu, false);
653 vgic_v4_load(vcpu);
654 preempt_enable();
655 }
656 }
657 }
658
659 /**
660 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
661 * @vcpu: The VCPU pointer
662 *
663 * This function is called through the VCPU_RUN ioctl from user space. It
664 * executes guest code in a loop until the time slice for the process is used
665 * up or some emulation is needed from user space, in which case the function
666 * returns 0 with the kvm_run structure filled in with the
667 * required data for the requested emulation.
668 */
669 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
670 {
671 struct kvm_run *run = vcpu->run;
672 int ret;
673
674 if (unlikely(!kvm_vcpu_initialized(vcpu)))
675 return -ENOEXEC;
676
677 ret = kvm_vcpu_first_run_init(vcpu);
678 if (ret)
679 return ret;
680
681 if (run->exit_reason == KVM_EXIT_MMIO) {
682 ret = kvm_handle_mmio_return(vcpu);
683 if (ret)
684 return ret;
685 }
686
687 if (run->immediate_exit)
688 return -EINTR;
689
690 vcpu_load(vcpu);
691
692 kvm_sigset_activate(vcpu);
693
694 ret = 1;
695 run->exit_reason = KVM_EXIT_UNKNOWN;
696 while (ret > 0) {
697 /*
698 * Check conditions before entering the guest
699 */
700 cond_resched();
701
702 update_vmid(&vcpu->arch.hw_mmu->vmid);
703
704 check_vcpu_requests(vcpu);
705
706 /*
707 * Preparing the interrupts to be injected also
708 * involves poking the GIC, which must be done in a
709 * non-preemptible context.
710 */
711 preempt_disable();
712
713 kvm_pmu_flush_hwstate(vcpu);
714
715 local_irq_disable();
716
717 kvm_vgic_flush_hwstate(vcpu);
718
719 /*
720 * Exit if we have a signal pending so that we can deliver the
721 * signal to user space.
722 */
723 if (signal_pending(current)) {
724 ret = -EINTR;
725 run->exit_reason = KVM_EXIT_INTR;
726 }
727
728 /*
729 * If we're using a userspace irqchip, then check if we need
730 * to tell a userspace irqchip about timer or PMU level
731 * changes and if so, exit to userspace (the actual level
732 * state gets updated in kvm_timer_update_run and
733 * kvm_pmu_update_run below).
734 */
735 if (static_branch_unlikely(&userspace_irqchip_in_use)) {
736 if (kvm_timer_should_notify_user(vcpu) ||
737 kvm_pmu_should_notify_user(vcpu)) {
738 ret = -EINTR;
739 run->exit_reason = KVM_EXIT_INTR;
740 }
741 }
742
743 /*
744 * Ensure we set mode to IN_GUEST_MODE after we disable
745 * interrupts and before the final VCPU requests check.
746 * See the comment in kvm_vcpu_exiting_guest_mode() and
747 * Documentation/virt/kvm/vcpu-requests.rst
748 */
749 smp_store_mb(vcpu->mode, IN_GUEST_MODE);
750
751 if (ret <= 0 || need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) ||
752 kvm_request_pending(vcpu)) {
753 vcpu->mode = OUTSIDE_GUEST_MODE;
754 isb(); /* Ensure work in x_flush_hwstate is committed */
755 kvm_pmu_sync_hwstate(vcpu);
756 if (static_branch_unlikely(&userspace_irqchip_in_use))
757 kvm_timer_sync_user(vcpu);
758 kvm_vgic_sync_hwstate(vcpu);
759 local_irq_enable();
760 preempt_enable();
761 continue;
762 }
763
764 kvm_arm_setup_debug(vcpu);
765
766 /**************************************************************
767 * Enter the guest
768 */
769 trace_kvm_entry(*vcpu_pc(vcpu));
770 guest_enter_irqoff();
771
772 ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
773
774 vcpu->mode = OUTSIDE_GUEST_MODE;
775 vcpu->stat.exits++;
776 /*
777 * Back from guest
778 *************************************************************/
779
780 kvm_arm_clear_debug(vcpu);
781
782 /*
783 * We must sync the PMU state before the vgic state so
784 * that the vgic can properly sample the updated state of the
785 * interrupt line.
786 */
787 kvm_pmu_sync_hwstate(vcpu);
788
789 /*
790 * Sync the vgic state before syncing the timer state because
791 * the timer code needs to know if the virtual timer
792 * interrupts are active.
793 */
794 kvm_vgic_sync_hwstate(vcpu);
795
796 /*
797 * Sync the timer hardware state before enabling interrupts as
798 * we don't want vtimer interrupts to race with syncing the
799 * timer virtual interrupt state.
800 */
801 if (static_branch_unlikely(&userspace_irqchip_in_use))
802 kvm_timer_sync_user(vcpu);
803
804 kvm_arch_vcpu_ctxsync_fp(vcpu);
805
806 /*
807 * We may have taken a host interrupt in HYP mode (ie
808 * while executing the guest). This interrupt is still
809 * pending, as we haven't serviced it yet!
810 *
811 * We're now back in SVC mode, with interrupts
812 * disabled. Enabling the interrupts now will have
813 * the effect of taking the interrupt again, in SVC
814 * mode this time.
815 */
816 local_irq_enable();
817
818 /*
819 * We do local_irq_enable() before calling guest_exit() so
820 * that if a timer interrupt hits while running the guest we
821 * account that tick as being spent in the guest. We enable
822 * preemption after calling guest_exit() so that if we get
823 * preempted we make sure ticks after that is not counted as
824 * guest time.
825 */
826 guest_exit();
827 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
828
829 /* Exit types that need handling before we can be preempted */
830 handle_exit_early(vcpu, ret);
831
832 preempt_enable();
833
834 /*
835 * The ARMv8 architecture doesn't give the hypervisor
836 * a mechanism to prevent a guest from dropping to AArch32 EL0
837 * if implemented by the CPU. If we spot the guest in such
838 * state and we decide that it wasn't supposed to do so (like
839 * with the asymmetric AArch32 case), return to userspace with
840 * a fatal error.
841 */
842 if (!system_supports_32bit_el0() && vcpu_mode_is_32bit(vcpu)) {
843 /*
844 * As we have caught the guest red-handed, decide that
845 * it isn't fit for purpose anymore by making the vcpu
846 * invalid. The VMM can try and fix it by issuing a
847 * KVM_ARM_VCPU_INIT if it really wants to.
848 */
849 vcpu->arch.target = -1;
850 ret = ARM_EXCEPTION_IL;
851 }
852
853 ret = handle_exit(vcpu, ret);
854 }
855
856 /* Tell userspace about in-kernel device output levels */
857 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
858 kvm_timer_update_run(vcpu);
859 kvm_pmu_update_run(vcpu);
860 }
861
862 kvm_sigset_deactivate(vcpu);
863
864 vcpu_put(vcpu);
865 return ret;
866 }
867
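/*
 * Assert or retire a virtual IRQ/FIQ line by toggling the HCR_EL2.VI/VF
 * bit for this vcpu; only used when the irqchip is emulated in userspace.
 */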
868 static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
869 {
870 int bit_index;
871 bool set;
872 unsigned long *hcr;
873
874 if (number == KVM_ARM_IRQ_CPU_IRQ)
875 bit_index = __ffs(HCR_VI);
876 else /* KVM_ARM_IRQ_CPU_FIQ */
877 bit_index = __ffs(HCR_VF);
878
879 hcr = vcpu_hcr(vcpu);
880 if (level)
881 set = test_and_set_bit(bit_index, hcr);
882 else
883 set = test_and_clear_bit(bit_index, hcr);
884
885 /*
886 * If we didn't change anything, no need to wake up or kick other CPUs
887 */
888 if (set == level)
889 return 0;
890
891 /*
892 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
893 * trigger a world-switch round on the running physical CPU to set the
894 * virtual IRQ/FIQ fields in the HCR appropriately.
895 */
896 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
897 kvm_vcpu_kick(vcpu);
898
899 return 0;
900 }
901
902 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
903 bool line_status)
904 {
905 u32 irq = irq_level->irq;
906 unsigned int irq_type, vcpu_idx, irq_num;
907 int nrcpus = atomic_read(&kvm->online_vcpus);
908 struct kvm_vcpu *vcpu = NULL;
909 bool level = irq_level->level;
910
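/*
 * The irq word packs, from high to low bits: an extra vcpu index field
 * (vcpu2), the irq type (CPU/PPI/SPI), the primary vcpu index and the
 * interrupt number; vcpu2 extends the addressable vcpu range beyond what
 * the original vcpu field allows.
 */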
911 irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
912 vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
913 vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
914 irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
915
916 trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
917
918 switch (irq_type) {
919 case KVM_ARM_IRQ_TYPE_CPU:
920 if (irqchip_in_kernel(kvm))
921 return -ENXIO;
922
923 if (vcpu_idx >= nrcpus)
924 return -EINVAL;
925
926 vcpu = kvm_get_vcpu(kvm, vcpu_idx);
927 if (!vcpu)
928 return -EINVAL;
929
930 if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
931 return -EINVAL;
932
933 return vcpu_interrupt_line(vcpu, irq_num, level);
934 case KVM_ARM_IRQ_TYPE_PPI:
935 if (!irqchip_in_kernel(kvm))
936 return -ENXIO;
937
938 if (vcpu_idx >= nrcpus)
939 return -EINVAL;
940
941 vcpu = kvm_get_vcpu(kvm, vcpu_idx);
942 if (!vcpu)
943 return -EINVAL;
944
945 if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
946 return -EINVAL;
947
948 return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
949 case KVM_ARM_IRQ_TYPE_SPI:
950 if (!irqchip_in_kernel(kvm))
951 return -ENXIO;
952
953 if (irq_num < VGIC_NR_PRIVATE_IRQS)
954 return -EINVAL;
955
956 return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
957 }
958
959 return -EINVAL;
960 }
961
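/*
 * Validate the requested target CPU and feature bitmap against the
 * physical CPU and any previous KVM_ARM_VCPU_INIT call, then reset the
 * vcpu with the new configuration.
 */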
962 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
963 const struct kvm_vcpu_init *init)
964 {
965 unsigned int i, ret;
966 int phys_target = kvm_target_cpu();
967
968 if (init->target != phys_target)
969 return -EINVAL;
970
971 /*
972 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
973 * use the same target.
974 */
975 if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
976 return -EINVAL;
977
978 /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
979 for (i = 0; i < sizeof(init->features) * 8; i++) {
980 bool set = (init->features[i / 32] & (1 << (i % 32)));
981
982 if (set && i >= KVM_VCPU_MAX_FEATURES)
983 return -ENOENT;
984
985 /*
986 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
987 * use the same feature set.
988 */
989 if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
990 test_bit(i, vcpu->arch.features) != set)
991 return -EINVAL;
992
993 if (set)
994 set_bit(i, vcpu->arch.features);
995 }
996
997 vcpu->arch.target = phys_target;
998
999 /* Now we know what it is, we can reset it. */
1000 ret = kvm_reset_vcpu(vcpu);
1001 if (ret) {
1002 vcpu->arch.target = -1;
1003 bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
1004 }
1005
1006 return ret;
1007 }
1008
1009 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
1010 struct kvm_vcpu_init *init)
1011 {
1012 int ret;
1013
1014 ret = kvm_vcpu_set_target(vcpu, init);
1015 if (ret)
1016 return ret;
1017
1018 /*
1019 * Ensure a rebooted VM will fault in RAM pages and detect if the
1020 * guest MMU is turned off and flush the caches as needed.
1021 *
1022 * S2FWB enforces all memory accesses to RAM being cacheable,
1023 * ensuring that the data side is always coherent. We still
1024 * need to invalidate the I-cache though, as FWB does *not*
1025 * imply CTR_EL0.DIC.
1026 */
1027 if (vcpu->arch.has_run_once) {
1028 if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
1029 stage2_unmap_vm(vcpu->kvm);
1030 else
1031 __flush_icache_all();
1032 }
1033
1034 vcpu_reset_hcr(vcpu);
1035
1036 /*
1037 * Handle the "start in power-off" case.
1038 */
1039 if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
1040 vcpu_power_off(vcpu);
1041 else
1042 vcpu->arch.power_off = false;
1043
1044 return 0;
1045 }
1046
1047 static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
1048 struct kvm_device_attr *attr)
1049 {
1050 int ret = -ENXIO;
1051
1052 switch (attr->group) {
1053 default:
1054 ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
1055 break;
1056 }
1057
1058 return ret;
1059 }
1060
1061 static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
1062 struct kvm_device_attr *attr)
1063 {
1064 int ret = -ENXIO;
1065
1066 switch (attr->group) {
1067 default:
1068 ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
1069 break;
1070 }
1071
1072 return ret;
1073 }
1074
1075 static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
1076 struct kvm_device_attr *attr)
1077 {
1078 int ret = -ENXIO;
1079
1080 switch (attr->group) {
1081 default:
1082 ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
1083 break;
1084 }
1085
1086 return ret;
1087 }
1088
1089 static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
1090 struct kvm_vcpu_events *events)
1091 {
1092 memset(events, 0, sizeof(*events));
1093
1094 return __kvm_arm_vcpu_get_events(vcpu, events);
1095 }
1096
1097 static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
1098 struct kvm_vcpu_events *events)
1099 {
1100 int i;
1101
1102 /* check whether the reserved field is zero */
1103 for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
1104 if (events->reserved[i])
1105 return -EINVAL;
1106
1107 /* check whether the pad field is zero */
1108 for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
1109 if (events->exception.pad[i])
1110 return -EINVAL;
1111
1112 return __kvm_arm_vcpu_set_events(vcpu, events);
1113 }
1114
1115 long kvm_arch_vcpu_ioctl(struct file *filp,
1116 unsigned int ioctl, unsigned long arg)
1117 {
1118 struct kvm_vcpu *vcpu = filp->private_data;
1119 void __user *argp = (void __user *)arg;
1120 struct kvm_device_attr attr;
1121 long r;
1122
1123 switch (ioctl) {
1124 case KVM_ARM_VCPU_INIT: {
1125 struct kvm_vcpu_init init;
1126
1127 r = -EFAULT;
1128 if (copy_from_user(&init, argp, sizeof(init)))
1129 break;
1130
1131 r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
1132 break;
1133 }
1134 case KVM_SET_ONE_REG:
1135 case KVM_GET_ONE_REG: {
1136 struct kvm_one_reg reg;
1137
1138 r = -ENOEXEC;
1139 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1140 break;
1141
1142 r = -EFAULT;
1143 if (copy_from_user(&reg, argp, sizeof(reg)))
1144 break;
1145
1146 /*
1147 * We could owe a reset due to PSCI. Handle the pending reset
1148 * here to ensure userspace register accesses are ordered after
1149 * the reset.
1150 */
1151 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
1152 kvm_reset_vcpu(vcpu);
1153
1154 if (ioctl == KVM_SET_ONE_REG)
1155 r = kvm_arm_set_reg(vcpu, &reg);
1156 else
1157 r = kvm_arm_get_reg(vcpu, &reg);
1158 break;
1159 }
1160 case KVM_GET_REG_LIST: {
1161 struct kvm_reg_list __user *user_list = argp;
1162 struct kvm_reg_list reg_list;
1163 unsigned n;
1164
1165 r = -ENOEXEC;
1166 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1167 break;
1168
1169 r = -EPERM;
1170 if (!kvm_arm_vcpu_is_finalized(vcpu))
1171 break;
1172
1173 r = -EFAULT;
1174 if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
1175 break;
1176 n = reg_list.n;
1177 reg_list.n = kvm_arm_num_regs(vcpu);
1178 if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
1179 break;
1180 r = -E2BIG;
1181 if (n < reg_list.n)
1182 break;
1183 r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
1184 break;
1185 }
1186 case KVM_SET_DEVICE_ATTR: {
1187 r = -EFAULT;
1188 if (copy_from_user(&attr, argp, sizeof(attr)))
1189 break;
1190 r = kvm_arm_vcpu_set_attr(vcpu, &attr);
1191 break;
1192 }
1193 case KVM_GET_DEVICE_ATTR: {
1194 r = -EFAULT;
1195 if (copy_from_user(&attr, argp, sizeof(attr)))
1196 break;
1197 r = kvm_arm_vcpu_get_attr(vcpu, &attr);
1198 break;
1199 }
1200 case KVM_HAS_DEVICE_ATTR: {
1201 r = -EFAULT;
1202 if (copy_from_user(&attr, argp, sizeof(attr)))
1203 break;
1204 r = kvm_arm_vcpu_has_attr(vcpu, &attr);
1205 break;
1206 }
1207 case KVM_GET_VCPU_EVENTS: {
1208 struct kvm_vcpu_events events;
1209
1210 if (kvm_arm_vcpu_get_events(vcpu, &events))
1211 return -EINVAL;
1212
1213 if (copy_to_user(argp, &events, sizeof(events)))
1214 return -EFAULT;
1215
1216 return 0;
1217 }
1218 case KVM_SET_VCPU_EVENTS: {
1219 struct kvm_vcpu_events events;
1220
1221 if (copy_from_user(&events, argp, sizeof(events)))
1222 return -EFAULT;
1223
1224 return kvm_arm_vcpu_set_events(vcpu, &events);
1225 }
1226 case KVM_ARM_VCPU_FINALIZE: {
1227 int what;
1228
1229 if (!kvm_vcpu_initialized(vcpu))
1230 return -ENOEXEC;
1231
1232 if (get_user(what, (const int __user *)argp))
1233 return -EFAULT;
1234
1235 return kvm_arm_vcpu_finalize(vcpu, what);
1236 }
1237 default:
1238 r = -EINVAL;
1239 }
1240
1241 return r;
1242 }
1243
1244 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
1245 {
1246
1247 }
1248
1249 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
1250 struct kvm_memory_slot *memslot)
1251 {
1252 kvm_flush_remote_tlbs(kvm);
1253 }
1254
1255 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
1256 struct kvm_arm_device_addr *dev_addr)
1257 {
1258 unsigned long dev_id, type;
1259
1260 dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
1261 KVM_ARM_DEVICE_ID_SHIFT;
1262 type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
1263 KVM_ARM_DEVICE_TYPE_SHIFT;
1264
1265 switch (dev_id) {
1266 case KVM_ARM_DEVICE_VGIC_V2:
1267 if (!vgic_present)
1268 return -ENXIO;
1269 return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
1270 default:
1271 return -ENODEV;
1272 }
1273 }
1274
1275 long kvm_arch_vm_ioctl(struct file *filp,
1276 unsigned int ioctl, unsigned long arg)
1277 {
1278 struct kvm *kvm = filp->private_data;
1279 void __user *argp = (void __user *)arg;
1280
1281 switch (ioctl) {
1282 case KVM_CREATE_IRQCHIP: {
1283 int ret;
1284 if (!vgic_present)
1285 return -ENXIO;
1286 mutex_lock(&kvm->lock);
1287 ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
1288 mutex_unlock(&kvm->lock);
1289 return ret;
1290 }
1291 case KVM_ARM_SET_DEVICE_ADDR: {
1292 struct kvm_arm_device_addr dev_addr;
1293
1294 if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
1295 return -EFAULT;
1296 return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
1297 }
1298 case KVM_ARM_PREFERRED_TARGET: {
1299 int err;
1300 struct kvm_vcpu_init init;
1301
1302 err = kvm_vcpu_preferred_target(&init);
1303 if (err)
1304 return err;
1305
1306 if (copy_to_user(argp, &init, sizeof(init)))
1307 return -EFAULT;
1308
1309 return 0;
1310 }
1311 default:
1312 return -EINVAL;
1313 }
1314 }
1315
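/*
 * Size (and allocation order) of the nVHE hyp per-cpu section, taken
 * from the linker-provided __per_cpu_start/__per_cpu_end symbols.
 */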
1316 static unsigned long nvhe_percpu_size(void)
1317 {
1318 return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
1319 (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
1320 }
1321
1322 static unsigned long nvhe_percpu_order(void)
1323 {
1324 unsigned long size = nvhe_percpu_size();
1325
1326 return size ? get_order(size) : 0;
1327 }
1328
1329 static int kvm_map_vectors(void)
1330 {
1331 /*
1332 * SV2 = ARM64_SPECTRE_V2
1333 * HEL2 = ARM64_HARDEN_EL2_VECTORS
1334 *
1335 * !SV2 + !HEL2 -> use direct vectors
1336 * SV2 + !HEL2 -> use hardened vectors in place
1337 * !SV2 + HEL2 -> allocate one vector slot and use exec mapping
1338 * SV2 + HEL2 -> use hardened vectors and use exec mapping
1339 */
1340 if (cpus_have_const_cap(ARM64_SPECTRE_V2) ||
1341 cpus_have_const_cap(ARM64_SPECTRE_BHB)) {
1342 __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
1343 __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
1344 }
1345
1346 if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
1347 phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
1348 unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
1349
1350 /*
1351 * Always allocate a spare vector slot, as we don't
1352 * know yet which CPUs have a BP hardening slot that
1353 * we can reuse.
1354 */
1355 __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
1356 BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
1357 return create_hyp_exec_mappings(vect_pa, size,
1358 &__kvm_bp_vect_base);
1359 }
1360
1361 return 0;
1362 }
1363
1364 static void cpu_init_hyp_mode(void)
1365 {
1366 phys_addr_t pgd_ptr;
1367 unsigned long hyp_stack_ptr;
1368 unsigned long vector_ptr;
1369 unsigned long tpidr_el2;
1370 struct arm_smccc_res res;
1371
1372 /* Switch from the HYP stub to our own HYP init vector */
1373 __hyp_set_vectors(kvm_get_idmap_vector());
1374
1375 /*
1376 * Calculate the raw per-cpu offset without a translation from the
1377 * kernel's mapping to the linear mapping, and store it in tpidr_el2
1378 * so that we can use adr_l to access per-cpu variables in EL2.
1379 */
1380 tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
1381 (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
1382
1383 pgd_ptr = kvm_mmu_get_httbr();
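/*
 * The stack pointer handed to EL2 is the top of the stack page (the
 * stack grows downwards), converted to its hyp VA alias below.
 */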
1384 hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
1385 hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
1386 vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
1387
1388 /*
1389 * Call initialization code, and switch to the full blown HYP code.
1390 * If the cpucaps haven't been finalized yet, something has gone very
1391 * wrong, and hyp will crash and burn when it uses any
1392 * cpus_have_const_cap() wrapper.
1393 */
1394 BUG_ON(!system_capabilities_finalized());
1395 arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
1396 pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
1397 WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
1398
1399 /*
1400 * Disabling SSBD on a non-VHE system requires us to enable SSBS
1401 * at EL2.
1402 */
1403 if (this_cpu_has_cap(ARM64_SSBS) &&
1404 arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
1405 kvm_call_hyp_nvhe(__kvm_enable_ssbs);
1406 }
1407 }
1408
1409 static void cpu_hyp_reset(void)
1410 {
1411 if (!is_kernel_in_hyp_mode())
1412 __hyp_reset_vectors();
1413 }
1414
1415 static void cpu_hyp_reinit(void)
1416 {
1417 kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
1418
1419 cpu_hyp_reset();
1420
1421 *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
1422
1423 if (is_kernel_in_hyp_mode())
1424 kvm_timer_init_vhe();
1425 else
1426 cpu_init_hyp_mode();
1427
1428 kvm_arm_init_debug();
1429
1430 if (vgic_present)
1431 kvm_vgic_init_cpu_hardware();
1432 }
1433
1434 static void _kvm_arch_hardware_enable(void *discard)
1435 {
1436 if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
1437 cpu_hyp_reinit();
1438 __this_cpu_write(kvm_arm_hardware_enabled, 1);
1439 }
1440 }
1441
1442 int kvm_arch_hardware_enable(void)
1443 {
1444 _kvm_arch_hardware_enable(NULL);
1445 return 0;
1446 }
1447
1448 static void _kvm_arch_hardware_disable(void *discard)
1449 {
1450 if (__this_cpu_read(kvm_arm_hardware_enabled)) {
1451 cpu_hyp_reset();
1452 __this_cpu_write(kvm_arm_hardware_enabled, 0);
1453 }
1454 }
1455
1456 void kvm_arch_hardware_disable(void)
1457 {
1458 _kvm_arch_hardware_disable(NULL);
1459 }
1460
1461 #ifdef CONFIG_CPU_PM
1462 static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
1463 unsigned long cmd,
1464 void *v)
1465 {
1466 /*
1467 * kvm_arm_hardware_enabled is left with its old value over
1468 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
1469 * re-enable hyp.
1470 */
1471 switch (cmd) {
1472 case CPU_PM_ENTER:
1473 if (__this_cpu_read(kvm_arm_hardware_enabled))
1474 /*
1475 * don't update kvm_arm_hardware_enabled here
1476 * so that the hardware will be re-enabled
1477 * when we resume. See below.
1478 */
1479 cpu_hyp_reset();
1480
1481 return NOTIFY_OK;
1482 case CPU_PM_ENTER_FAILED:
1483 case CPU_PM_EXIT:
1484 if (__this_cpu_read(kvm_arm_hardware_enabled))
1485 /* The hardware was enabled before suspend. */
1486 cpu_hyp_reinit();
1487
1488 return NOTIFY_OK;
1489
1490 default:
1491 return NOTIFY_DONE;
1492 }
1493 }
1494
1495 static struct notifier_block hyp_init_cpu_pm_nb = {
1496 .notifier_call = hyp_init_cpu_pm_notifier,
1497 };
1498
1499 static void __init hyp_cpu_pm_init(void)
1500 {
1501 cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
1502 }
1503 static void __init hyp_cpu_pm_exit(void)
1504 {
1505 cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
1506 }
1507 #else
1508 static inline void hyp_cpu_pm_init(void)
1509 {
1510 }
1511 static inline void hyp_cpu_pm_exit(void)
1512 {
1513 }
1514 #endif
1515
1516 static int init_common_resources(void)
1517 {
1518 return kvm_set_ipa_limit();
1519 }
1520
1521 static int init_subsystems(void)
1522 {
1523 int err = 0;
1524
1525 /*
1526 * Enable hardware so that subsystem initialisation can access EL2.
1527 */
1528 on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
1529
1530 /*
1531 * Register CPU lower-power notifier
1532 */
1533 hyp_cpu_pm_init();
1534
1535 /*
1536 * Init HYP view of VGIC
1537 */
1538 err = kvm_vgic_hyp_init();
1539 switch (err) {
1540 case 0:
1541 vgic_present = true;
1542 break;
1543 case -ENODEV:
1544 case -ENXIO:
1545 vgic_present = false;
1546 err = 0;
1547 break;
1548 default:
1549 goto out;
1550 }
1551
1552 /*
1553 * Init HYP architected timer support
1554 */
1555 err = kvm_timer_hyp_init(vgic_present);
1556 if (err)
1557 goto out;
1558
1559 kvm_perf_init();
1560 kvm_coproc_table_init();
1561
1562 out:
1563 on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
1564
1565 return err;
1566 }
1567
1568 static void teardown_hyp_mode(void)
1569 {
1570 int cpu;
1571
1572 free_hyp_pgds();
1573 for_each_possible_cpu(cpu) {
1574 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
1575 free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
1576 }
1577 }
1578
1579 /**
1580 * Initialize Hyp mode: allocate stacks and percpu regions and create the hyp mappings for all CPUs
1581 */
1582 static int init_hyp_mode(void)
1583 {
1584 int cpu;
1585 int err = 0;
1586
1587 /*
1588 * Allocate Hyp PGD and setup Hyp identity mapping
1589 */
1590 err = kvm_mmu_init();
1591 if (err)
1592 goto out_err;
1593
1594 /*
1595 * Allocate stack pages for Hypervisor-mode
1596 */
1597 for_each_possible_cpu(cpu) {
1598 unsigned long stack_page;
1599
1600 stack_page = __get_free_page(GFP_KERNEL);
1601 if (!stack_page) {
1602 err = -ENOMEM;
1603 goto out_err;
1604 }
1605
1606 per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
1607 }
1608
1609 /*
1610 * Allocate and initialize pages for Hypervisor-mode percpu regions.
1611 */
1612 for_each_possible_cpu(cpu) {
1613 struct page *page;
1614 void *page_addr;
1615
1616 page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
1617 if (!page) {
1618 err = -ENOMEM;
1619 goto out_err;
1620 }
1621
1622 page_addr = page_address(page);
1623 memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
1624 kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
1625 }
1626
1627 /*
1628 * Map the Hyp-code called directly from the host
1629 */
1630 err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
1631 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
1632 if (err) {
1633 kvm_err("Cannot map world-switch code\n");
1634 goto out_err;
1635 }
1636
1637 err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
1638 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
1639 if (err) {
1640 kvm_err("Cannot map rodata section\n");
1641 goto out_err;
1642 }
1643
1644 err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
1645 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
1646 if (err) {
1647 kvm_err("Cannot map bss section\n");
1648 goto out_err;
1649 }
1650
1651 err = kvm_map_vectors();
1652 if (err) {
1653 kvm_err("Cannot map vectors\n");
1654 goto out_err;
1655 }
1656
1657 /*
1658 * Map the Hyp stack pages
1659 */
1660 for_each_possible_cpu(cpu) {
1661 char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
1662 err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
1663 PAGE_HYP);
1664
1665 if (err) {
1666 kvm_err("Cannot map hyp stack\n");
1667 goto out_err;
1668 }
1669 }
1670
1671 /*
1672 * Map Hyp percpu pages
1673 */
1674 for_each_possible_cpu(cpu) {
1675 char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
1676 char *percpu_end = percpu_begin + nvhe_percpu_size();
1677
1678 err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
1679
1680 if (err) {
1681 kvm_err("Cannot map hyp percpu region\n");
1682 goto out_err;
1683 }
1684 }
1685
1686 return 0;
1687
1688 out_err:
1689 teardown_hyp_mode();
1690 kvm_err("error initializing Hyp mode: %d\n", err);
1691 return err;
1692 }
1693
1694 static void check_kvm_target_cpu(void *ret)
1695 {
1696 *(int *)ret = kvm_target_cpu();
1697 }
1698
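/*
 * Look up a vcpu by the affinity bits of its MPIDR; returns NULL if no
 * vcpu in this VM matches.
 */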
1699 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
1700 {
1701 struct kvm_vcpu *vcpu;
1702 int i;
1703
1704 mpidr &= MPIDR_HWID_BITMASK;
1705 kvm_for_each_vcpu(i, vcpu, kvm) {
1706 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
1707 return vcpu;
1708 }
1709 return NULL;
1710 }
1711
1712 bool kvm_arch_has_irq_bypass(void)
1713 {
1714 return true;
1715 }
1716
1717 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
1718 struct irq_bypass_producer *prod)
1719 {
1720 struct kvm_kernel_irqfd *irqfd =
1721 container_of(cons, struct kvm_kernel_irqfd, consumer);
1722
1723 return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
1724 &irqfd->irq_entry);
1725 }
1726 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
1727 struct irq_bypass_producer *prod)
1728 {
1729 struct kvm_kernel_irqfd *irqfd =
1730 container_of(cons, struct kvm_kernel_irqfd, consumer);
1731
1732 kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
1733 &irqfd->irq_entry);
1734 }
1735
1736 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
1737 {
1738 struct kvm_kernel_irqfd *irqfd =
1739 container_of(cons, struct kvm_kernel_irqfd, consumer);
1740
1741 kvm_arm_halt_guest(irqfd->kvm);
1742 }
1743
1744 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
1745 {
1746 struct kvm_kernel_irqfd *irqfd =
1747 container_of(cons, struct kvm_kernel_irqfd, consumer);
1748
1749 kvm_arm_resume_guest(irqfd->kvm);
1750 }
1751
1752 /**
1753 * Initialize Hyp-mode and memory mappings on all CPUs.
1754 */
1755 int kvm_arch_init(void *opaque)
1756 {
1757 int err;
1758 int ret, cpu;
1759 bool in_hyp_mode;
1760
1761 if (!is_hyp_mode_available()) {
1762 kvm_info("HYP mode not available\n");
1763 return -ENODEV;
1764 }
1765
1766 in_hyp_mode = is_kernel_in_hyp_mode();
1767
1768 if (!in_hyp_mode && kvm_arch_requires_vhe()) {
1769 kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
1770 return -ENODEV;
1771 }
1772
1773 if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
1774 cpus_have_final_cap(ARM64_WORKAROUND_1508412))
1775 kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
1776 "Only trusted guests should be used on this system.\n");
1777
1778 for_each_online_cpu(cpu) {
1779 smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
1780 if (ret < 0) {
1781 kvm_err("Error, CPU %d not supported!\n", cpu);
1782 return -ENODEV;
1783 }
1784 }
1785
1786 err = init_common_resources();
1787 if (err)
1788 return err;
1789
1790 err = kvm_arm_init_sve();
1791 if (err)
1792 return err;
1793
1794 if (!in_hyp_mode) {
1795 err = init_hyp_mode();
1796 if (err)
1797 goto out_err;
1798 }
1799
1800 err = init_subsystems();
1801 if (err)
1802 goto out_hyp;
1803
1804 if (in_hyp_mode)
1805 kvm_info("VHE mode initialized successfully\n");
1806 else
1807 kvm_info("Hyp mode initialized successfully\n");
1808
1809 return 0;
1810
1811 out_hyp:
1812 hyp_cpu_pm_exit();
1813 if (!in_hyp_mode)
1814 teardown_hyp_mode();
1815 out_err:
1816 return err;
1817 }
1818
1819 /* NOP: Compiling as a module not supported */
1820 void kvm_arch_exit(void)
1821 {
1822 kvm_perf_teardown();
1823 }
1824
1825 static int arm_init(void)
1826 {
1827 int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
1828 return rc;
1829 }
1830
1831 module_init(arm_init);
1832