// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2021 Google LLC
 * Author: Fuad Tabba <tabba@google.com>
 */

#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_host.h>
#include <asm/kvm_mmu.h>
#include <asm/memory.h>

#include <linux/kvm_host.h>
#include <linux/mm.h>

#include <kvm/arm_hypercalls.h>
#include <kvm/arm_psci.h>

#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
#include <nvhe/trap_handler.h>

/* Used by icache_is_vpipt(). */
unsigned long __icache_flags;

/*
 * Set trap register values based on features in ID_AA64PFR0.
 */
static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
{
        const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1);
        u64 hcr_set = HCR_RW;
        u64 hcr_clear = 0;
        u64 cptr_set = 0;

        /* Protected KVM does not support AArch32 guests. */
        BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0),
                PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY);
        BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1),
                PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY);

        /*
         * Linux guests assume support for floating-point and Advanced SIMD. Do
         * not change the trapping behavior for these from the KVM default.
         */
        BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP),
                                PVM_ID_AA64PFR0_ALLOW));
        BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD),
                                PVM_ID_AA64PFR0_ALLOW));

        /* Trap RAS unless all current versions are supported */
        if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), feature_ids) <
            ID_AA64PFR0_RAS_V1P1) {
                hcr_set |= HCR_TERR | HCR_TEA;
                hcr_clear |= HCR_FIEN;
        }

        /* Trap AMU */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_AMU), feature_ids)) {
                hcr_clear |= HCR_AMVOFFEN;
                cptr_set |= CPTR_EL2_TAM;
        }

        /* Trap SVE */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_SVE), feature_ids))
                cptr_set |= CPTR_EL2_TZ;

        vcpu->arch.hcr_el2 |= hcr_set;
        vcpu->arch.hcr_el2 &= ~hcr_clear;
        vcpu->arch.cptr_el2 |= cptr_set;
}

/*
 * Set trap register values based on features in ID_AA64PFR1.
 */
static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu)
{
        const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1);
        u64 hcr_set = 0;
        u64 hcr_clear = 0;

        /* Memory Tagging: Trap and Treat as Untagged if not supported. */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), feature_ids)) {
                hcr_set |= HCR_TID5;
                hcr_clear |= HCR_DCT | HCR_ATA;
        }

        vcpu->arch.hcr_el2 |= hcr_set;
        vcpu->arch.hcr_el2 &= ~hcr_clear;
}

/*
 * Set trap register values based on features in ID_AA64DFR0.
 */
static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
{
        const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1);
        u64 mdcr_set = 0;
        u64 mdcr_clear = 0;
        u64 cptr_set = 0;

        /* Trap/constrain PMU */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), feature_ids)) {
                mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
                mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME |
                              MDCR_EL2_HPMN_MASK;
        }

        /* Trap Debug */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), feature_ids))
                mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA;

        /* Trap OS Double Lock */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DOUBLELOCK), feature_ids))
                mdcr_set |= MDCR_EL2_TDOSA;

        /* Trap SPE */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER), feature_ids)) {
                mdcr_set |= MDCR_EL2_TPMS;
                mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
        }

        /* Trap Trace Filter */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACE_FILT), feature_ids))
                mdcr_set |= MDCR_EL2_TTRF;

        /* Trap Trace */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACEVER), feature_ids))
                cptr_set |= CPTR_EL2_TTA;

        vcpu->arch.mdcr_el2 |= mdcr_set;
        vcpu->arch.mdcr_el2 &= ~mdcr_clear;
        vcpu->arch.cptr_el2 |= cptr_set;
}

/*
 * Set trap register values based on features in ID_AA64MMFR0.
 */
static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu)
{
        const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1);
        u64 mdcr_set = 0;

        /* Trap Debug Communications Channel registers */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_FGT), feature_ids))
                mdcr_set |= MDCR_EL2_TDCC;

        vcpu->arch.mdcr_el2 |= mdcr_set;
}

/*
 * Set trap register values based on features in ID_AA64MMFR1.
 */
static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu)
{
        const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1);
        u64 hcr_set = 0;

        /* Trap LOR */
        if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_LOR), feature_ids))
                hcr_set |= HCR_TLOR;

        vcpu->arch.hcr_el2 |= hcr_set;
}

/*
 * Set baseline trap register values.
 */
static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
{
        vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
        vcpu->arch.mdcr_el2 = 0;

        /*
         * Always trap:
         * - Feature id registers: to control features exposed to guests
         * - Implementation-defined features
         */
        vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS |
                             HCR_TID3 | HCR_TACR | HCR_TIDCP | HCR_TID1;

        if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) {
                /* route synchronous external abort exceptions to EL2 */
                vcpu->arch.hcr_el2 |= HCR_TEA;
                /* trap error record accesses */
                vcpu->arch.hcr_el2 |= HCR_TERR;
        }

        if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
                vcpu->arch.hcr_el2 |= HCR_FWB;

        if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE))
                vcpu->arch.hcr_el2 |= HCR_TID2;
}
static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
{
        pvm_init_trap_regs(vcpu);
        pvm_init_traps_aa64pfr0(vcpu);
        pvm_init_traps_aa64pfr1(vcpu);
        pvm_init_traps_aa64dfr0(vcpu);
        pvm_init_traps_aa64mmfr0(vcpu);
        pvm_init_traps_aa64mmfr1(vcpu);
}

/*
 * Start the shadow table handle at the offset defined instead of at 0.
 * Mainly for sanity checking and debugging.
 */
#define HANDLE_OFFSET 0x1000

static int shadow_handle_to_index(int shadow_handle)
{
        return shadow_handle - HANDLE_OFFSET;
}

static int index_to_shadow_handle(int index)
{
        return index + HANDLE_OFFSET;
}

extern unsigned long hyp_nr_cpus;

/*
 * Track the vcpu most recently loaded on each physical CPU.
 */
static DEFINE_PER_CPU(struct kvm_vcpu *, last_loaded_vcpu);

/*
 * Spinlock protecting the shadow table related state.
 * Protects writes to shadow_table, num_shadow_entries, and next_shadow_alloc,
 * as well as reads and writes to last_loaded_vcpu.
 */
static DEFINE_HYP_SPINLOCK(shadow_lock);

/*
 * The table of shadow entries for protected VMs in hyp.
 * Allocated at hyp initialization and setup.
 */
static struct kvm_shadow_vm **shadow_table;

/* Current number of vms in the shadow table. */
static int num_shadow_entries;

/* The next entry index to try to allocate from. */
static int next_shadow_alloc;

void hyp_shadow_table_init(void *tbl)
{
        WARN_ON(shadow_table);
        shadow_table = tbl;
}

/*
 * Return the shadow vm corresponding to the handle.
 */
static struct kvm_shadow_vm *find_shadow_by_handle(int shadow_handle)
{
        int shadow_index = shadow_handle_to_index(shadow_handle);

        if (unlikely(shadow_index < 0 || shadow_index >= KVM_MAX_PVMS))
                return NULL;

        return shadow_table[shadow_index];
}

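/*
 * Look up and load the shadow vcpu identified by (shadow_handle, vcpu_idx).
 *
 * On success, marks the vcpu as loaded on this physical CPU, takes a
 * reference on the shadow vm's page and, if a different vcpu was loaded here
 * last, flushes the CPU context (TLBs and I-cache) so that stale entries
 * from another vcpu cannot be observed.
 *
 * Returns the shadow vcpu on success, or NULL if the handle or index is
 * invalid, or if the vcpu is already loaded on another CPU.
 */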
struct kvm_vcpu *get_shadow_vcpu(int shadow_handle, unsigned int vcpu_idx)
{
        struct kvm_vcpu *vcpu = NULL;
        struct kvm_shadow_vm *vm;
        bool flush_context = false;

        hyp_spin_lock(&shadow_lock);
        vm = find_shadow_by_handle(shadow_handle);
        if (!vm || vm->nr_vcpus <= vcpu_idx)
                goto unlock;
        vcpu = &vm->shadow_vcpus[vcpu_idx]->vcpu;

        /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */
        if (unlikely(vcpu->arch.pkvm.loaded_on_cpu)) {
                vcpu = NULL;
                goto unlock;
        }

        /*
         * Guarantee that both TLBs and I-cache are private to each vcpu.
         * The check below is conservative and could lead to over-invalidation,
         * because there is no need to nuke the contexts if the vcpu belongs to
         * a different vm.
         */
        if (vcpu != __this_cpu_read(last_loaded_vcpu)) {
                flush_context = true;
                __this_cpu_write(last_loaded_vcpu, vcpu);
        }

        vcpu->arch.pkvm.loaded_on_cpu = true;

        hyp_page_ref_inc(hyp_virt_to_page(vm));
unlock:
        hyp_spin_unlock(&shadow_lock);

        /* No need for the lock while flushing the context. */
        if (flush_context)
                __kvm_flush_cpu_context(vcpu->arch.hw_mmu);

        return vcpu;
}

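/*
 * Drop a shadow vcpu previously obtained with get_shadow_vcpu(): clear its
 * loaded_on_cpu marker and release the reference held on the shadow vm.
 */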
void put_shadow_vcpu(struct kvm_vcpu *vcpu)
{
        struct kvm_shadow_vm *vm = vcpu->arch.pkvm.shadow_vm;

        hyp_spin_lock(&shadow_lock);
        vcpu->arch.pkvm.loaded_on_cpu = false;
        hyp_page_ref_dec(hyp_virt_to_page(vm));
        hyp_spin_unlock(&shadow_lock);
}

/* Check and copy the supported features for the vcpu from the host. */
static int copy_features(struct kvm_vcpu *shadow_vcpu, struct kvm_vcpu *host_vcpu)
{
        DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);

        bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES);

        /*
         * Always allowed:
         * - CPU starting in poweroff state
         * - PSCI v0.2
         */
        set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features);
        set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features);

        /*
         * Check if remaining features are allowed:
         * - Performance Monitoring
         * - Pointer Authentication
         */
        if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), PVM_ID_AA64DFR0_ALLOW))
                set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features);

        if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_API), PVM_ID_AA64ISAR1_ALLOW) &&
            FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA), PVM_ID_AA64ISAR1_ALLOW))
                set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features);

        if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI), PVM_ID_AA64ISAR1_ALLOW) &&
            FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA), PVM_ID_AA64ISAR1_ALLOW))
                set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features);

        bitmap_and(shadow_vcpu->arch.features, host_vcpu->arch.features,
                   allowed_features, KVM_VCPU_MAX_FEATURES);

        /*
         * Check for system support for address/generic pointer authentication
         * features if either are enabled.
         */
        if ((test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, shadow_vcpu->arch.features) ||
             test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, shadow_vcpu->arch.features)) &&
            !system_has_full_ptr_auth())
                return -EINVAL;

        return 0;
}

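/*
 * Release the pins taken at init time on the host's vcpu struct and, if SVE
 * is in use, on the host-provided SVE state backing the shadow vcpu.
 */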
static void unpin_host_vcpu(struct shadow_vcpu_state *shadow_vcpu)
{
        struct kvm_vcpu *host_vcpu = shadow_vcpu->vcpu.arch.pkvm.host_vcpu;
        size_t sve_state_size;
        void *sve_state = shadow_vcpu->vcpu.arch.sve_state;

        hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);

        if (!sve_state)
                return;

        sve_state = kern_hyp_va(sve_state);
        sve_state_size = vcpu_sve_state_size(&shadow_vcpu->vcpu);
        hyp_unpin_shared_mem(sve_state, sve_state + sve_state_size);
}

static void unpin_host_vcpus(struct shadow_vcpu_state *shadow_vcpus[], int nr_vcpus)
{
        int i;

        for (i = 0; i < nr_vcpus; i++)
                unpin_host_vcpu(shadow_vcpus[i]);
}

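/* Enable pointer authentication for the vcpu if either ptrauth feature is set. */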
static int init_ptrauth(struct kvm_vcpu *shadow_vcpu)
{
        int ret = 0;

        if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, shadow_vcpu->arch.features) ||
            test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, shadow_vcpu->arch.features))
                ret = kvm_vcpu_enable_ptrauth(shadow_vcpu);
        return ret;
}

static void init_shadow_vm(struct kvm *kvm, struct kvm_shadow_vm *vm,
                           int nr_vcpus)
{
        vm->host_kvm = kvm;
        vm->created_vcpus = nr_vcpus;
        vm->arch.pkvm.pvmfw_load_addr = kvm->arch.pkvm.pvmfw_load_addr;
        vm->arch.pkvm.enabled = READ_ONCE(kvm->arch.pkvm.enabled);
}

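/*
 * Initialize a single shadow vcpu from its host counterpart: pin the host
 * vcpu, copy the allowed feature set, set up ptrauth and SVE state if
 * enabled, configure traps (for protected VMs) and system registers, and
 * record the reset/entry state (pvmfw entry point or VMM-provided PC/R0).
 *
 * Returns 0 on success; on failure the host vcpu is unpinned and a negative
 * error code is returned.
 */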
static int init_shadow_vcpu(struct shadow_vcpu_state *shadow_state,
                            struct kvm_vcpu *host_vcpu,
                            struct kvm_shadow_vm *vm, int vcpu_idx)
{
        struct kvm_vcpu *shadow_vcpu = &shadow_state->vcpu;
        int ret;

        host_vcpu = kern_hyp_va(host_vcpu);
        if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
                return -EBUSY;

        if (host_vcpu->vcpu_idx != vcpu_idx) {
                ret = -EINVAL;
                goto done;
        }

        shadow_vcpu->arch.pkvm.host_vcpu = host_vcpu;
        shadow_vcpu->kvm = vm->host_kvm;
        shadow_vcpu->vcpu_id = host_vcpu->vcpu_id;
        shadow_vcpu->vcpu_idx = vcpu_idx;

        ret = copy_features(shadow_vcpu, host_vcpu);
        if (ret)
                goto done;

        ret = init_ptrauth(shadow_vcpu);
        if (ret)
                goto done;

        if (test_bit(KVM_ARM_VCPU_SVE, shadow_vcpu->arch.features)) {
                size_t sve_state_size;
                void *sve_state;

                shadow_vcpu->arch.sve_state =
                        READ_ONCE(host_vcpu->arch.sve_state);
                shadow_vcpu->arch.sve_max_vl =
                        READ_ONCE(host_vcpu->arch.sve_max_vl);

                sve_state = kern_hyp_va(shadow_vcpu->arch.sve_state);
                sve_state_size = vcpu_sve_state_size(shadow_vcpu);

                if (!shadow_vcpu->arch.sve_state || !sve_state_size ||
                    hyp_pin_shared_mem(sve_state, sve_state + sve_state_size)) {
                        clear_bit(KVM_ARM_VCPU_SVE, shadow_vcpu->arch.features);
                        shadow_vcpu->arch.sve_state = NULL;
                        shadow_vcpu->arch.sve_max_vl = 0;
                        ret = -EINVAL;
                        goto done;
                }
        }

        if (vm->arch.pkvm.enabled)
                pkvm_vcpu_init_traps(shadow_vcpu);
        kvm_reset_pvm_sys_regs(shadow_vcpu);

        vm->vcpus[vcpu_idx] = shadow_vcpu;
        shadow_state->vm = vm;

        shadow_vcpu->arch.hw_mmu = &vm->arch.mmu;
        shadow_vcpu->arch.pkvm.shadow_vm = vm;
        shadow_vcpu->arch.power_off = true;

        if (test_bit(KVM_ARM_VCPU_POWER_OFF, shadow_vcpu->arch.features)) {
                shadow_vcpu->arch.pkvm.power_state =
                        PSCI_0_2_AFFINITY_LEVEL_OFF;
        } else if (pvm_has_pvmfw(vm)) {
                if (vm->pvmfw_entry_vcpu) {
                        ret = -EINVAL;
                        goto done;
                }

                vm->pvmfw_entry_vcpu = shadow_vcpu;
                shadow_vcpu->arch.reset_state.reset = true;
                shadow_vcpu->arch.pkvm.power_state =
                        PSCI_0_2_AFFINITY_LEVEL_ON_PENDING;
        } else {
                struct vcpu_reset_state *reset_state =
                        &shadow_vcpu->arch.reset_state;

                reset_state->pc = *vcpu_pc(host_vcpu);
                reset_state->r0 = vcpu_get_reg(host_vcpu, 0);
                reset_state->reset = true;
                shadow_vcpu->arch.pkvm.power_state =
                        PSCI_0_2_AFFINITY_LEVEL_ON_PENDING;
        }

done:
        if (ret)
                unpin_host_vcpu(shadow_state);

        return ret;
}

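/*
 * Return true if a shadow table entry already exists for this host kvm
 * struct, i.e. the host has already created a shadow for this VM.
 */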
static bool __exists_shadow(struct kvm *host_kvm)
{
        int i;
        int num_checked = 0;

        for (i = 0; i < KVM_MAX_PVMS && num_checked < num_shadow_entries; i++) {
                if (!shadow_table[i])
                        continue;

                if (unlikely(shadow_table[i]->host_kvm == host_kvm))
                        return true;

                num_checked++;
        }

        return false;
}

/*
 * Allocate a shadow table entry and insert a pointer to the shadow vm.
 *
 * Return a unique handle to the protected VM on success,
 * negative error code on failure.
 */
static int insert_shadow_table(struct kvm *kvm, struct kvm_shadow_vm *vm,
                               size_t shadow_size)
{
        struct kvm_s2_mmu *mmu = &vm->arch.mmu;
        int shadow_handle;
        int vmid;

        hyp_assert_lock_held(&shadow_lock);

        if (unlikely(num_shadow_entries >= KVM_MAX_PVMS))
                return -ENOMEM;

        /*
         * Initializing protected state might have failed, yet a malicious host
         * could trigger this function. Thus, ensure that shadow_table exists.
         */
        if (unlikely(!shadow_table))
                return -EINVAL;

        /* Check that a shadow hasn't been created before for this host KVM. */
        if (unlikely(__exists_shadow(kvm)))
                return -EEXIST;

        /* Find the next free entry in the shadow table. */
        while (shadow_table[next_shadow_alloc])
                next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS;
        shadow_handle = index_to_shadow_handle(next_shadow_alloc);

        vm->shadow_handle = shadow_handle;
        vm->shadow_area_size = shadow_size;

        /* VMID 0 is reserved for the host */
        vmid = next_shadow_alloc + 1;
        if (vmid > 0xff)
                return -ENOMEM;

        mmu->vmid.vmid = vmid;
        mmu->vmid.vmid_gen = 0;
        mmu->arch = &vm->arch;
        mmu->pgt = &vm->pgt;

        shadow_table[next_shadow_alloc] = vm;
        next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS;
        num_shadow_entries++;

        return shadow_handle;
}

/*
 * Deallocate and remove the shadow table entry corresponding to the handle.
 */
static void remove_shadow_table(int shadow_handle)
{
        hyp_assert_lock_held(&shadow_lock);
        shadow_table[shadow_handle_to_index(shadow_handle)] = NULL;
        num_shadow_entries--;
}

static size_t pkvm_get_shadow_size(int num_vcpus)
{
        /* Shadow space for the vm struct and all of its vcpu states. */
        return sizeof(struct kvm_shadow_vm) +
               sizeof(struct shadow_vcpu_state *) * num_vcpus;
}

/*
 * Check whether the size of the area donated by the host is sufficient for
 * the shadow structures required for nr_vcpus as well as the shadow vm.
 */
static int check_shadow_size(int nr_vcpus, size_t shadow_size)
{
        if (nr_vcpus < 1 || nr_vcpus > KVM_MAX_VCPUS)
                return -EINVAL;

        /*
         * Shadow size is rounded up when allocated and donated by the host,
         * so it's likely to be larger than the sum of the struct sizes.
         */
        if (shadow_size < pkvm_get_shadow_size(nr_vcpus))
                return -EINVAL;

        return 0;
}

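/*
 * Return every page remaining in the vcpus' hyp memcaches to the host: pop
 * each page, push it onto the teardown memcache, and transfer its ownership
 * back from hyp to the host.
 */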
static void drain_shadow_vcpus(struct shadow_vcpu_state *shadow_vcpus[],
                               unsigned int nr_vcpus,
                               struct kvm_hyp_memcache *mc)
{
        int i;

        for (i = 0; i < nr_vcpus; i++) {
                struct kvm_vcpu *shadow_vcpu = &shadow_vcpus[i]->vcpu;
                struct kvm_hyp_memcache *vcpu_mc = &shadow_vcpu->arch.pkvm_memcache;
                void *addr;

                while (vcpu_mc->nr_pages) {
                        addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt);
                        push_hyp_memcache(mc, addr, hyp_virt_to_phys);
                        WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
                }
        }
}

/*
 * Initialize the shadow copy of the protected VM state using the memory
 * donated by the host.
 *
 * Unmaps the donated memory from the host at stage 2.
 *
 * kvm: A pointer to the host's struct kvm (host va).
 * shadow_va: The host va of the area being donated for the shadow state.
 *            Must be page aligned.
 * shadow_size: The size of the area being donated for the shadow state.
 *              Must be a multiple of the page size.
 * pgd: The host va of the area being donated for the stage-2 PGD for the VM.
 *      Must be page aligned. Its size is implied by the VM's VTCR.
 *
 * Return a unique handle to the protected VM on success,
 * negative error code on failure.
 */
int __pkvm_init_shadow(struct kvm *kvm,
                       void *shadow_va,
                       size_t shadow_size,
                       void *pgd)
{
        struct kvm_shadow_vm *vm = kern_hyp_va(shadow_va);
        phys_addr_t shadow_pa = hyp_virt_to_phys(vm);
        u64 pfn = hyp_phys_to_pfn(shadow_pa);
        u64 nr_shadow_pages = shadow_size >> PAGE_SHIFT;
        u64 nr_pgd_pages;
        size_t pgd_size;
        int nr_vcpus = 0;
        int ret = 0;

        /* Check that the donated memory is aligned to page boundaries. */
        if (!PAGE_ALIGNED(shadow_va) ||
            !PAGE_ALIGNED(shadow_size) ||
            !PAGE_ALIGNED(pgd))
                return -EINVAL;

        kvm = kern_hyp_va(kvm);
        pgd = kern_hyp_va(pgd);

        ret = hyp_pin_shared_mem(kvm, kvm + 1);
        if (ret)
                return ret;

        /* Ensure the host has donated enough memory for the shadow structs. */
        nr_vcpus = kvm->created_vcpus;
        ret = check_shadow_size(nr_vcpus, shadow_size);
        if (ret)
                goto err;

        ret = __pkvm_host_donate_hyp(pfn, nr_shadow_pages);
        if (ret)
                goto err;

        /* Ensure we're working with a clean slate. */
        memset(vm, 0, shadow_size);

        vm->arch.vtcr = host_kvm.arch.vtcr;
        pgd_size = kvm_pgtable_stage2_pgd_size(host_kvm.arch.vtcr);
        nr_pgd_pages = pgd_size >> PAGE_SHIFT;
        ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(pgd), nr_pgd_pages);
        if (ret)
                goto err_remove_mappings;

        init_shadow_vm(kvm, vm, nr_vcpus);

        /* Add the entry to the shadow table. */
        hyp_spin_lock(&shadow_lock);
        ret = insert_shadow_table(kvm, vm, shadow_size);
        if (ret < 0)
                goto err_unlock;

        ret = kvm_guest_prepare_stage2(vm, pgd);
        if (ret)
                goto err_remove_shadow_table;

        hyp_spin_unlock(&shadow_lock);
        return vm->shadow_handle;

err_remove_shadow_table:
        remove_shadow_table(vm->shadow_handle);
err_unlock:
        hyp_spin_unlock(&shadow_lock);
        WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(pgd), nr_pgd_pages));
err_remove_mappings:
        /* Clear the donated shadow memory on failure to avoid data leaks. */
        memset(vm, 0, shadow_size);
        WARN_ON(__pkvm_hyp_donate_host(hyp_phys_to_pfn(shadow_pa),
                                       shadow_size >> PAGE_SHIFT));
err:
        hyp_unpin_shared_mem(kvm, kvm + 1);
        return ret;
}

/*
 * Initialize the protected vcpu state shadow copy in host-donated memory.
 *
 * shadow_handle: The handle for the protected vm.
 * host_vcpu: A pointer to the corresponding host vcpu (host va).
 * shadow_vcpu_hva: The host va of the area being donated for the vcpu state.
 *                  Must be page aligned. The size of the area must be equal
 *                  to the page-aligned size of shadow_vcpu_state.
 *
 * Return 0 on success, negative error code on failure.
 */
int __pkvm_init_shadow_vcpu(unsigned int shadow_handle,
                            struct kvm_vcpu *host_vcpu,
                            void *shadow_vcpu_hva)
{
        struct kvm_shadow_vm *vm;
        struct shadow_vcpu_state *shadow_state = kern_hyp_va(shadow_vcpu_hva);
        size_t vcpu_state_sz = sizeof(*shadow_state);
        u64 nr_pages = PAGE_ALIGN(vcpu_state_sz) >> PAGE_SHIFT;
        unsigned int idx;
        int ret;

        if (!PAGE_ALIGNED(shadow_vcpu_hva))
                return -EINVAL;

        ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(shadow_state),
                                     nr_pages);
        if (ret)
                return ret;

        memset(shadow_state, 0, vcpu_state_sz);

        hyp_spin_lock(&shadow_lock);

        vm = find_shadow_by_handle(shadow_handle);
        if (!vm) {
                ret = -ENOENT;
                goto unlock;
        }

        idx = vm->nr_vcpus;
        if (idx >= vm->created_vcpus) {
                ret = -EINVAL;
                goto unlock;
        }

        ret = init_shadow_vcpu(shadow_state, host_vcpu, vm, idx);
        if (ret)
                goto unlock;

        vm->shadow_vcpus[idx] = shadow_state;
        vm->nr_vcpus++;
unlock:
        hyp_spin_unlock(&shadow_lock);

        if (ret) {
                memset(shadow_state, 0, vcpu_state_sz);
                WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(shadow_state),
                                               nr_pages));
        }

        return ret;
}

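/*
 * Scrub a hyp-owned donation before handing it back: zero the memory, clean
 * it to the PoC, queue each page on the teardown memcache, and return
 * ownership of the pages to the host.
 */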
static void teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr,
                                    size_t size)
{
        u64 pfn = hyp_phys_to_pfn(__hyp_pa(addr));
        u64 nr_pages = size >> PAGE_SHIFT;
        void *start;

        memset(addr, 0, size);
        kvm_flush_dcache_to_poc(addr, size);

        for (start = addr; start < addr + size; start += PAGE_SIZE)
                push_hyp_memcache(mc, start, hyp_virt_to_phys);

        WARN_ON(__pkvm_hyp_donate_host(pfn, nr_pages));
}

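/*
 * Tear down the shadow state for the VM identified by shadow_handle: remove
 * it from the shadow table, invalidate its VMID, reclaim guest and
 * page-table pages into the host's teardown memcache, unpin the host
 * structures, and return all donated memory to the host.
 *
 * Fails with -EBUSY if any of the VM's vcpus are still loaded (i.e., the
 * shadow vm still has references held on it).
 */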
int __pkvm_teardown_shadow(int shadow_handle)
{
        struct kvm_hyp_memcache *mc;
        struct kvm_shadow_vm *vm;
        struct kvm *host_kvm;
        unsigned int nr_vcpus;
        int err;
        int i;

        /* Lookup then remove entry from the shadow table. */
        hyp_spin_lock(&shadow_lock);
        vm = find_shadow_by_handle(shadow_handle);
        if (!vm) {
                err = -ENOENT;
                goto err_unlock;
        }

        if (WARN_ON(hyp_page_count(vm))) {
                err = -EBUSY;
                goto err_unlock;
        }

        host_kvm = vm->host_kvm;
        nr_vcpus = vm->nr_vcpus;

        /*
         * Clear the tracking for last_loaded_vcpu for all cpus for this vm in
         * case the same addresses for those vcpus are reused for future vms.
         */
        for (i = 0; i < hyp_nr_cpus; i++) {
                struct kvm_vcpu **last_loaded_vcpu_ptr =
                        per_cpu_ptr(&last_loaded_vcpu, i);
                struct kvm_vcpu *vcpu = *last_loaded_vcpu_ptr;

                if (vcpu && vcpu->arch.pkvm.shadow_vm == vm)
                        *last_loaded_vcpu_ptr = NULL;
        }

        /* Ensure the VMID is clean before it can be reallocated */
        __kvm_tlb_flush_vmid(&vm->arch.mmu);
        remove_shadow_table(shadow_handle);
        hyp_spin_unlock(&shadow_lock);

        /* Reclaim guest pages, and page-table pages */
        mc = &host_kvm->arch.pkvm.teardown_mc;
        reclaim_guest_pages(vm, mc);
        drain_shadow_vcpus(vm->shadow_vcpus, nr_vcpus, mc);
        unpin_host_vcpus(vm->shadow_vcpus, nr_vcpus);

        for (i = 0; i < nr_vcpus; i++)
                teardown_donated_memory(mc, vm->shadow_vcpus[i],
                                        PAGE_ALIGN(sizeof(*vm->shadow_vcpus[i])));
        teardown_donated_memory(mc, vm, vm->shadow_area_size);

        hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
        return 0;

err_unlock:
        hyp_spin_unlock(&shadow_lock);
        return err;
}

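/*
 * Copy the protected VM firmware (pvmfw) image into guest memory, one page
 * at a time via the hyp fixmap. The offset into the image is derived from
 * the given IPA relative to the configured pvmfw load address, and the copy
 * is truncated to the remaining size of the image.
 */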
int pkvm_load_pvmfw_pages(struct kvm_shadow_vm *vm, u64 ipa, phys_addr_t phys,
                          u64 size)
{
        struct kvm_protected_vm *pkvm = &vm->arch.pkvm;
        u64 npages, offset = ipa - pkvm->pvmfw_load_addr;
        void *src = hyp_phys_to_virt(pvmfw_base) + offset;

        if (offset >= pvmfw_size)
                return -EINVAL;

        size = min(size, pvmfw_size - offset);
        if (!PAGE_ALIGNED(size) || !PAGE_ALIGNED(src))
                return -EINVAL;

        npages = size >> PAGE_SHIFT;
        while (npages--) {
                void *dst;

                dst = hyp_fixmap_map(phys);
                if (!dst)
                        return -EINVAL;

                /*
                 * No need for cache maintenance here, as the pgtable code will
                 * take care of this when installing the pte in the guest's
                 * stage-2 page table.
                 */
                memcpy(dst, src, PAGE_SIZE);

                hyp_fixmap_unmap();
                src += PAGE_SIZE;
                phys += PAGE_SIZE;
        }

        return 0;
}

void pkvm_clear_pvmfw_pages(void)
{
        void *addr = hyp_phys_to_virt(pvmfw_base);

        memset(addr, 0, pvmfw_size);
        kvm_flush_dcache_to_poc(addr, pvmfw_size);
}

/*
 * This function sets the registers on the vcpu to their architecturally
 * defined reset values.
 *
 * Note: Can only be called by the vcpu on itself, after it has been turned on.
 */
void pkvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
        struct vcpu_reset_state *reset_state = &vcpu->arch.reset_state;
        struct kvm_shadow_vm *vm = vcpu->arch.pkvm.shadow_vm;

        WARN_ON(!reset_state->reset);

        init_ptrauth(vcpu);

        /* Reset core registers */
        memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
        memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
        vcpu_gp_regs(vcpu)->pstate = VCPU_RESET_PSTATE_EL1;

        /* Reset system registers */
        kvm_reset_pvm_sys_regs(vcpu);

        /* Propagate initiator's endianness, after kvm_reset_pvm_sys_regs. */
        if (reset_state->be)
                kvm_vcpu_set_be(vcpu);

        if (vm->pvmfw_entry_vcpu == vcpu) {
                struct kvm_vcpu *host_vcpu = vcpu->arch.pkvm.host_vcpu;
                u64 entry = vm->arch.pkvm.pvmfw_load_addr;
                int i;

                /* X0 - X14 provided by the VMM (preserved) */
                for (i = 0; i <= 14; ++i)
                        vcpu_set_reg(vcpu, i, vcpu_get_reg(host_vcpu, i));

                /* X15: Boot protocol version */
                vcpu_set_reg(vcpu, 15, 0);

                /* PC: IPA of pvmfw base */
                *vcpu_pc(vcpu) = entry;

                vm->pvmfw_entry_vcpu = NULL;

                /* Auto enroll MMIO guard */
                set_bit(KVM_ARCH_FLAG_MMIO_GUARD,
                        &vcpu->arch.pkvm.shadow_vm->arch.flags);
        } else {
                *vcpu_pc(vcpu) = reset_state->pc;
                vcpu_set_reg(vcpu, 0, reset_state->r0);
        }

        reset_state->reset = false;

        vcpu->arch.pkvm.exit_code = 0;

        WARN_ON(vcpu->arch.pkvm.power_state != PSCI_0_2_AFFINITY_LEVEL_ON_PENDING);
        WRITE_ONCE(vcpu->arch.power_off, false);
        WRITE_ONCE(vcpu->arch.pkvm.power_state, PSCI_0_2_AFFINITY_LEVEL_ON);
}

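/*
 * Resolve an MPIDR affinity value to the matching vcpu of the shadow vm, or
 * NULL if no vcpu of this vm has that affinity.
 */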
struct kvm_vcpu *pvm_mpidr_to_vcpu(struct kvm_shadow_vm *vm, unsigned long mpidr)
{
        struct kvm_vcpu *vcpu;
        int i;

        mpidr &= MPIDR_HWID_BITMASK;

        for (i = 0; i < READ_ONCE(vm->nr_vcpus); i++) {
                vcpu = vm->vcpus[i];

                if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
                        return vcpu;
        }

        return NULL;
}

/*
 * Returns true if the hypervisor handled the PSCI call, and control should go
 * back to the guest, or false if the host needs to do some additional work
 * (i.e., wake up the vcpu).
 */
static bool pvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
{
        struct kvm_shadow_vm *vm = source_vcpu->arch.pkvm.shadow_vm;
        struct kvm_vcpu *vcpu;
        struct vcpu_reset_state *reset_state;
        unsigned long cpu_id;
        unsigned long hvc_ret_val;
        int power_state;

        cpu_id = smccc_get_arg1(source_vcpu);
        if (!kvm_psci_valid_affinity(source_vcpu, cpu_id)) {
                hvc_ret_val = PSCI_RET_INVALID_PARAMS;
                goto error;
        }

        vcpu = pvm_mpidr_to_vcpu(vm, cpu_id);

        /* Make sure the caller requested a valid vcpu. */
        if (!vcpu) {
                hvc_ret_val = PSCI_RET_INVALID_PARAMS;
                goto error;
        }

        /*
         * Make sure the requested vcpu is not on to begin with.
         * Atomic to avoid race between vcpus trying to power on the same vcpu.
         */
        power_state = cmpxchg(&vcpu->arch.pkvm.power_state,
                              PSCI_0_2_AFFINITY_LEVEL_OFF,
                              PSCI_0_2_AFFINITY_LEVEL_ON_PENDING);
        switch (power_state) {
        case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING:
                hvc_ret_val = PSCI_RET_ON_PENDING;
                goto error;
        case PSCI_0_2_AFFINITY_LEVEL_ON:
                hvc_ret_val = PSCI_RET_ALREADY_ON;
                goto error;
        case PSCI_0_2_AFFINITY_LEVEL_OFF:
                break;
        default:
                hvc_ret_val = PSCI_RET_INTERNAL_FAILURE;
                goto error;
        }

        reset_state = &vcpu->arch.reset_state;

        reset_state->pc = smccc_get_arg2(source_vcpu);
        reset_state->r0 = smccc_get_arg3(source_vcpu);

        /* Propagate caller endianness */
        reset_state->be = kvm_vcpu_is_be(source_vcpu);

        reset_state->reset = true;

        /*
         * Return to the host, which should make the KVM_REQ_VCPU_RESET request
         * as well as kvm_vcpu_wake_up() to schedule the vcpu.
         */
        return false;

error:
        /* If there's an error go back straight to the guest. */
        smccc_set_retval(source_vcpu, hvc_ret_val, 0, 0, 0);
        return true;
}

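/*
 * Handle PSCI AFFINITY_INFO for a protected guest: compute the aggregate
 * power state of all vcpus matching the requested affinity, entirely within
 * the hypervisor. Always returns to the guest.
 */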
static bool pvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
{
        int i, matching_cpus = 0;
        unsigned long mpidr;
        unsigned long target_affinity;
        unsigned long target_affinity_mask;
        unsigned long lowest_affinity_level;
        struct kvm_shadow_vm *vm = vcpu->arch.pkvm.shadow_vm;
        struct kvm_vcpu *tmp;
        unsigned long hvc_ret_val;

        target_affinity = smccc_get_arg1(vcpu);
        lowest_affinity_level = smccc_get_arg2(vcpu);

        if (!kvm_psci_valid_affinity(vcpu, target_affinity)) {
                hvc_ret_val = PSCI_RET_INVALID_PARAMS;
                goto done;
        }

        /* Determine target affinity mask */
        target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
        if (!target_affinity_mask) {
                hvc_ret_val = PSCI_RET_INVALID_PARAMS;
                goto done;
        }

        /* Ignore other bits of target affinity */
        target_affinity &= target_affinity_mask;

        hvc_ret_val = PSCI_0_2_AFFINITY_LEVEL_OFF;

        /*
         * If at least one vcpu matching the target affinity is ON then return
         * ON; otherwise, if at least one is ON_PENDING then return ON_PENDING.
         * Otherwise, return OFF.
         */
        for (i = 0; i < READ_ONCE(vm->nr_vcpus); i++) {
                tmp = vm->vcpus[i];
                mpidr = kvm_vcpu_get_mpidr_aff(tmp);

                if ((mpidr & target_affinity_mask) == target_affinity) {
                        int power_state;

                        matching_cpus++;
                        power_state = READ_ONCE(tmp->arch.pkvm.power_state);
                        switch (power_state) {
                        case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING:
                                hvc_ret_val = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING;
                                break;
                        case PSCI_0_2_AFFINITY_LEVEL_ON:
                                hvc_ret_val = PSCI_0_2_AFFINITY_LEVEL_ON;
                                goto done;
                        case PSCI_0_2_AFFINITY_LEVEL_OFF:
                                break;
                        default:
                                hvc_ret_val = PSCI_RET_INTERNAL_FAILURE;
                                goto done;
                        }
                }
        }

        if (!matching_cpus)
                hvc_ret_val = PSCI_RET_INVALID_PARAMS;

done:
        /* Nothing to be handled by the host. Go back to the guest. */
        smccc_set_retval(vcpu, hvc_ret_val, 0, 0, 0);
        return true;
}

/*
 * Returns true if the hypervisor has handled the PSCI call, and control should
 * go back to the guest, or false if the host needs to do some additional work
 * (e.g., turn off and update vcpu scheduling status).
 */
static bool pvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
{
        WARN_ON(vcpu->arch.power_off);
        WARN_ON(vcpu->arch.pkvm.power_state != PSCI_0_2_AFFINITY_LEVEL_ON);

        WRITE_ONCE(vcpu->arch.power_off, true);
        WRITE_ONCE(vcpu->arch.pkvm.power_state, PSCI_0_2_AFFINITY_LEVEL_OFF);

        /* Return to the host so that it can finish powering off the vcpu. */
        return false;
}

static bool pvm_psci_version(struct kvm_vcpu *vcpu)
{
        /* Nothing to be handled by the host. Go back to the guest. */
        smccc_set_retval(vcpu, KVM_ARM_PSCI_1_1, 0, 0, 0);
        return true;
}

static bool pvm_psci_not_supported(struct kvm_vcpu *vcpu)
{
        /* Nothing to be handled by the host. Go back to the guest. */
        smccc_set_retval(vcpu, PSCI_RET_NOT_SUPPORTED, 0, 0, 0);
        return true;
}

static bool pvm_psci_features(struct kvm_vcpu *vcpu)
{
        u32 feature = smccc_get_arg1(vcpu);
        unsigned long val;

        switch (feature) {
        case PSCI_0_2_FN_PSCI_VERSION:
        case PSCI_0_2_FN_CPU_SUSPEND:
        case PSCI_0_2_FN64_CPU_SUSPEND:
        case PSCI_0_2_FN_CPU_OFF:
        case PSCI_0_2_FN_CPU_ON:
        case PSCI_0_2_FN64_CPU_ON:
        case PSCI_0_2_FN_AFFINITY_INFO:
        case PSCI_0_2_FN64_AFFINITY_INFO:
        case PSCI_0_2_FN_SYSTEM_OFF:
        case PSCI_0_2_FN_SYSTEM_RESET:
        case PSCI_1_0_FN_PSCI_FEATURES:
        case PSCI_1_1_FN_SYSTEM_RESET2:
        case PSCI_1_1_FN64_SYSTEM_RESET2:
        case ARM_SMCCC_VERSION_FUNC_ID:
                val = PSCI_RET_SUCCESS;
                break;
        default:
                val = PSCI_RET_NOT_SUPPORTED;
                break;
        }

        /* Nothing to be handled by the host. Go back to the guest. */
        smccc_set_retval(vcpu, val, 0, 0, 0);
        return true;
}

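/*
 * Dispatch a guest PSCI call: CPU_ON, CPU_OFF, AFFINITY_INFO, PSCI_VERSION
 * and PSCI_FEATURES are handled (or partially handled) in the hypervisor,
 * while SYSTEM_OFF, SYSTEM_RESET(2) and CPU_SUSPEND are left for the host.
 * Anything else is reported to the guest as not supported.
 */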
static bool pkvm_handle_psci(struct kvm_vcpu *vcpu)
{
        u32 psci_fn = smccc_get_function(vcpu);

        switch (psci_fn) {
        case PSCI_0_2_FN_CPU_ON:
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
        case PSCI_0_2_FN64_CPU_ON:
                return pvm_psci_vcpu_on(vcpu);
        case PSCI_0_2_FN_CPU_OFF:
                return pvm_psci_vcpu_off(vcpu);
        case PSCI_0_2_FN_AFFINITY_INFO:
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
        case PSCI_0_2_FN64_AFFINITY_INFO:
                return pvm_psci_vcpu_affinity_info(vcpu);
        case PSCI_0_2_FN_PSCI_VERSION:
                return pvm_psci_version(vcpu);
        case PSCI_1_0_FN_PSCI_FEATURES:
                return pvm_psci_features(vcpu);
        case PSCI_0_2_FN_SYSTEM_RESET:
        case PSCI_0_2_FN_CPU_SUSPEND:
        case PSCI_0_2_FN64_CPU_SUSPEND:
        case PSCI_0_2_FN_SYSTEM_OFF:
        case PSCI_1_1_FN_SYSTEM_RESET2:
        case PSCI_1_1_FN64_SYSTEM_RESET2:
                return false; /* Handled by the host. */
        default:
                break;
        }

        return pvm_psci_not_supported(vcpu);
}

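/*
 * Fabricate a stage-2 data abort for the faulting IPA and rewind the ELR so
 * the guest re-issues the MEM_SHARE hypercall once the host has mapped the
 * page. Used when the page being shared is not yet mapped in the guest's
 * stage-2.
 */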
static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa)
{
        u64 elr;

        /* Fake up a data abort (Level 3 translation fault on write) */
        vcpu->arch.fault.esr_el2 = (u32)ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT |
                                   ESR_ELx_WNR | ESR_ELx_FSC_FAULT |
                                   FIELD_PREP(ESR_ELx_FSC_LEVEL, 3);

        /* Shuffle the IPA around into the HPFAR */
        vcpu->arch.fault.hpfar_el2 = (ipa >> 8) & HPFAR_MASK;

        /* This is a virtual address. 0's good. Let's go with 0. */
        vcpu->arch.fault.far_el2 = 0;

        /* Rewind the ELR so we return to the HVC once the IPA is mapped */
        elr = read_sysreg(elr_el2);
        elr -= 4;
        write_sysreg(elr, elr_el2);

        return ARM_EXCEPTION_TRAP;
}

static bool pkvm_memshare_call(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u64 ipa = smccc_get_arg1(vcpu);
        u64 arg2 = smccc_get_arg2(vcpu);
        u64 arg3 = smccc_get_arg3(vcpu);
        int err;

        if (arg2 || arg3)
                goto out_guest_err;

        err = __pkvm_guest_share_host(vcpu, ipa);
        switch (err) {
        case 0:
                /* Success! Now tell the host. */
                goto out_host;
        case -EFAULT:
                /*
                 * Convert the exception into a data abort so that the page
                 * being shared is mapped into the guest next time.
                 */
                *exit_code = __pkvm_memshare_page_req(vcpu, ipa);
                goto out_host;
        }

out_guest_err:
        smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0);
        return true;

out_host:
        return false;
}

static bool pkvm_memunshare_call(struct kvm_vcpu *vcpu)
{
        u64 ipa = smccc_get_arg1(vcpu);
        u64 arg2 = smccc_get_arg2(vcpu);
        u64 arg3 = smccc_get_arg3(vcpu);
        int err;

        if (arg2 || arg3)
                goto out_guest_err;

        err = __pkvm_guest_unshare_host(vcpu, ipa);
        if (err)
                goto out_guest_err;

        return false;

out_guest_err:
        smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0);
        return true;
}

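/*
 * Handle the MMIO guard MAP hypercall. On -ENOMEM the HVC is rewound and
 * forwarded to the host so it can top up the vcpu's memcache; other failures
 * are reported to the guest as invalid parameters.
 */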
static bool pkvm_install_ioguard_page(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u64 retval = SMCCC_RET_SUCCESS;
        u64 ipa = smccc_get_arg1(vcpu);
        int ret;

        ret = __pkvm_install_ioguard_page(vcpu, ipa);
        if (ret == -ENOMEM) {
                /*
                 * We ran out of memcache, let's ask for more. Cancel
                 * the effects of the HVC that took us here, and
                 * forward the hypercall to the host for page donation
                 * purposes.
                 */
                write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
                return false;
        }

        if (ret)
                retval = SMCCC_RET_INVALID_PARAMETER;

        smccc_set_retval(vcpu, retval, 0, 0, 0);
        return true;
}

bool smccc_trng_available;

static bool pkvm_forward_trng(struct kvm_vcpu *vcpu)
{
        u32 fn = smccc_get_function(vcpu);
        struct arm_smccc_res res;
        unsigned long arg1 = 0;

        /*
         * Forward TRNG calls to EL3, as we can't trust the host to handle
         * these for us.
         */
        switch (fn) {
        case ARM_SMCCC_TRNG_FEATURES:
        case ARM_SMCCC_TRNG_RND32:
        case ARM_SMCCC_TRNG_RND64:
                arg1 = smccc_get_arg1(vcpu);
                fallthrough;
        case ARM_SMCCC_TRNG_VERSION:
        case ARM_SMCCC_TRNG_GET_UUID:
                arm_smccc_1_1_smc(fn, arg1, &res);
                smccc_set_retval(vcpu, res.a0, res.a1, res.a2, res.a3);
                memzero_explicit(&res, sizeof(res));
                break;
        }

        return true;
}

/*
 * Handler for protected VM HVC calls.
 *
 * Returns true if the hypervisor has handled the exit, and control should go
 * back to the guest, or false if it hasn't.
 */
bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u32 fn = smccc_get_function(vcpu);
        u64 val[4] = { SMCCC_RET_NOT_SUPPORTED };

        switch (fn) {
        case ARM_SMCCC_VERSION_FUNC_ID:
                /* Nothing to be handled by the host. Go back to the guest. */
                val[0] = ARM_SMCCC_VERSION_1_1;
                break;
        case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
                val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
                val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
                val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
                val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
                val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
                val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO);
                val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE);
                val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE);
                val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO);
                val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL);
                val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP);
                val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP);
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID:
                set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->arch.pkvm.shadow_vm->arch.flags);
                val[0] = SMCCC_RET_SUCCESS;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID:
                return pkvm_install_ioguard_page(vcpu, exit_code);
        case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID:
                if (__pkvm_remove_ioguard_page(vcpu, vcpu_get_reg(vcpu, 1)))
                        val[0] = SMCCC_RET_INVALID_PARAMETER;
                else
                        val[0] = SMCCC_RET_SUCCESS;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID:
        case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID:
                if (smccc_get_arg1(vcpu) ||
                    smccc_get_arg2(vcpu) ||
                    smccc_get_arg3(vcpu)) {
                        val[0] = SMCCC_RET_INVALID_PARAMETER;
                } else {
                        val[0] = PAGE_SIZE;
                }
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID:
                return pkvm_memshare_call(vcpu, exit_code);
        case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID:
                return pkvm_memunshare_call(vcpu);
        case ARM_SMCCC_TRNG_VERSION ... ARM_SMCCC_TRNG_RND32:
        case ARM_SMCCC_TRNG_RND64:
                if (smccc_trng_available)
                        return pkvm_forward_trng(vcpu);
                break;
        default:
                return pkvm_handle_psci(vcpu);
        }

        smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
        return true;
}