1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2021 Google LLC
4 * Author: Fuad Tabba <tabba@google.com>
5 */
6
7 #include <asm/kvm_arm.h>
8 #include <asm/kvm_asm.h>
9 #include <asm/kvm_host.h>
10 #include <asm/kvm_mmu.h>
11 #include <asm/memory.h>
12
13 #include <linux/kvm_host.h>
14 #include <linux/mm.h>
15
16 #include <kvm/arm_hypercalls.h>
17 #include <kvm/arm_psci.h>
18
19 #include <nvhe/mem_protect.h>
20 #include <nvhe/mm.h>
21 #include <nvhe/pkvm.h>
22 #include <nvhe/trap_handler.h>
23
24 /* Used by icache_is_vpipt(). */
25 unsigned long __icache_flags;
26
27 /*
28 * Set trap register values based on features in ID_AA64PFR0.
29 */
30 static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
31 {
32 const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1);
33 u64 hcr_set = HCR_RW;
34 u64 hcr_clear = 0;
35 u64 cptr_set = 0;
36
37 /* Protected KVM does not support AArch32 guests. */
38 BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0),
39 PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY);
40 BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1),
41 PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY);
42
43 /*
44 * Linux guests assume support for floating-point and Advanced SIMD. Do
45 * not change the trapping behavior for these from the KVM default.
46 */
47 BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP),
48 PVM_ID_AA64PFR0_ALLOW));
49 BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD),
50 PVM_ID_AA64PFR0_ALLOW));
51
52 /* Trap RAS unless all current versions are supported */
53 if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), feature_ids) <
54 ID_AA64PFR0_RAS_V1P1) {
55 hcr_set |= HCR_TERR | HCR_TEA;
56 hcr_clear |= HCR_FIEN;
57 }
58
59 /* Trap AMU */
60 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_AMU), feature_ids)) {
61 hcr_clear |= HCR_AMVOFFEN;
62 cptr_set |= CPTR_EL2_TAM;
63 }
64
65 /* Trap SVE */
66 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_SVE), feature_ids))
67 cptr_set |= CPTR_EL2_TZ;
68
69 vcpu->arch.hcr_el2 |= hcr_set;
70 vcpu->arch.hcr_el2 &= ~hcr_clear;
71 vcpu->arch.cptr_el2 |= cptr_set;
72 }
73
74 /*
75 * Set trap register values based on features in ID_AA64PFR1.
76 */
77 static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu)
78 {
79 const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1);
80 u64 hcr_set = 0;
81 u64 hcr_clear = 0;
82
83 /* Memory Tagging: Trap and Treat as Untagged if not supported. */
84 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), feature_ids)) {
85 hcr_set |= HCR_TID5;
86 hcr_clear |= HCR_DCT | HCR_ATA;
87 }
88
89 vcpu->arch.hcr_el2 |= hcr_set;
90 vcpu->arch.hcr_el2 &= ~hcr_clear;
91 }
92
93 /*
94 * Set trap register values based on features in ID_AA64DFR0.
95 */
96 static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
97 {
98 const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1);
99 u64 mdcr_set = 0;
100 u64 mdcr_clear = 0;
101 u64 cptr_set = 0;
102
103 /* Trap/constrain PMU */
104 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), feature_ids)) {
105 mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
106 mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME |
107 MDCR_EL2_HPMN_MASK;
108 }
109
110 /* Trap Debug */
111 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), feature_ids))
112 mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA;
113
114 /* Trap OS Double Lock */
115 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DOUBLELOCK), feature_ids))
116 mdcr_set |= MDCR_EL2_TDOSA;
117
118 /* Trap SPE */
119 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER), feature_ids)) {
120 mdcr_set |= MDCR_EL2_TPMS;
121 mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
122 }
123
124 /* Trap Trace Filter */
125 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACE_FILT), feature_ids))
126 mdcr_set |= MDCR_EL2_TTRF;
127
128 /* Trap Trace */
129 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACEVER), feature_ids))
130 cptr_set |= CPTR_EL2_TTA;
131
132 vcpu->arch.mdcr_el2 |= mdcr_set;
133 vcpu->arch.mdcr_el2 &= ~mdcr_clear;
134 vcpu->arch.cptr_el2 |= cptr_set;
135 }
136
137 /*
138 * Set trap register values based on features in ID_AA64MMFR0.
139 */
140 static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu)
141 {
142 const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1);
143 u64 mdcr_set = 0;
144
145 /* Trap Debug Communications Channel registers */
146 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_FGT), feature_ids))
147 mdcr_set |= MDCR_EL2_TDCC;
148
149 vcpu->arch.mdcr_el2 |= mdcr_set;
150 }
151
152 /*
153 * Set trap register values based on features in ID_AA64MMFR1.
154 */
155 static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu)
156 {
157 const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1);
158 u64 hcr_set = 0;
159
160 /* Trap LOR */
161 if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_LOR), feature_ids))
162 hcr_set |= HCR_TLOR;
163
164 vcpu->arch.hcr_el2 |= hcr_set;
165 }
166
167 /*
168 * Set baseline trap register values.
169 */
170 static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
171 {
172 vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
173 vcpu->arch.mdcr_el2 = 0;
174
175 /*
176 * Always trap:
177 * - Feature id registers: to control features exposed to guests
178 * - Implementation-defined features
179 */
180 vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS |
181 HCR_TID3 | HCR_TACR | HCR_TIDCP | HCR_TID1;
182
183 if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) {
184 /* route synchronous external abort exceptions to EL2 */
185 vcpu->arch.hcr_el2 |= HCR_TEA;
186 /* trap error record accesses */
187 vcpu->arch.hcr_el2 |= HCR_TERR;
188 }
189
190 if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
191 vcpu->arch.hcr_el2 |= HCR_FWB;
192
193 if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE))
194 vcpu->arch.hcr_el2 |= HCR_TID2;
195 }
196
197 /*
198 * Initialize trap register values for protected VMs.
199 */
200 static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
201 {
202 pvm_init_trap_regs(vcpu);
203 pvm_init_traps_aa64pfr0(vcpu);
204 pvm_init_traps_aa64pfr1(vcpu);
205 pvm_init_traps_aa64dfr0(vcpu);
206 pvm_init_traps_aa64mmfr0(vcpu);
207 pvm_init_traps_aa64mmfr1(vcpu);
208 }
209
210 /*
211 * Start shadow table handles at the defined offset instead of at 0,
212 * mainly for sanity checking and debugging.
213 */
214 #define HANDLE_OFFSET 0x1000
215
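/* Convert between shadow table handles and shadow table array indices. */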
216 static int shadow_handle_to_index(int shadow_handle)
217 {
218 return shadow_handle - HANDLE_OFFSET;
219 }
220
221 static int index_to_shadow_handle(int index)
222 {
223 return index + HANDLE_OFFSET;
224 }
225
226 extern unsigned long hyp_nr_cpus;
227
228 /*
229 * Track the vcpu most recently loaded on each physical CPU.
230 */
231 static DEFINE_PER_CPU(struct kvm_vcpu *, last_loaded_vcpu);
232
233 /*
234 * Spinlock for protecting the shadow table related state.
235 * Protects writes to shadow_table, num_shadow_entries, and next_shadow_alloc,
236 * as well as reads and writes to last_loaded_vcpu.
237 */
238 static DEFINE_HYP_SPINLOCK(shadow_lock);
239
240 /*
241 * The table of shadow entries for protected VMs in hyp.
242 * Allocated at hyp initialization and setup.
243 */
244 static struct kvm_shadow_vm **shadow_table;
245
246 /* Current number of vms in the shadow table. */
247 static int num_shadow_entries;
248
249 /* The next entry index to try to allocate from. */
250 static int next_shadow_alloc;
251
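/*
 * Set the shadow table to the memory allocated at hyp initialization.
 * Warns if the table has already been set.
 */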
252 void hyp_shadow_table_init(void *tbl)
253 {
254 WARN_ON(shadow_table);
255 shadow_table = tbl;
256 }
257
258 /*
259 * Return the shadow vm corresponding to the handle.
260 */
261 static struct kvm_shadow_vm *find_shadow_by_handle(int shadow_handle)
262 {
263 int shadow_index = shadow_handle_to_index(shadow_handle);
264
265 if (unlikely(shadow_index < 0 || shadow_index >= KVM_MAX_PVMS))
266 return NULL;
267
268 return shadow_table[shadow_index];
269 }
270
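/*
 * Look up and load the shadow vcpu identified by (shadow_handle, vcpu_idx).
 *
 * Marks the vcpu as loaded on this physical CPU, takes a reference on its
 * shadow vm, and flushes the CPU context if a different vcpu was the last one
 * run on this CPU. Must be paired with put_shadow_vcpu().
 *
 * Return the shadow vcpu on success, NULL on failure.
 */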
271 struct kvm_vcpu *get_shadow_vcpu(int shadow_handle, unsigned int vcpu_idx)
272 {
273 struct kvm_vcpu *vcpu = NULL;
274 struct kvm_shadow_vm *vm;
275 bool flush_context = false;
276
277 hyp_spin_lock(&shadow_lock);
278 vm = find_shadow_by_handle(shadow_handle);
279 if (!vm || vm->nr_vcpus <= vcpu_idx)
280 goto unlock;
281 vcpu = &vm->shadow_vcpus[vcpu_idx]->vcpu;
282
283 /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */
284 if (unlikely(vcpu->arch.pkvm.loaded_on_cpu)) {
285 vcpu = NULL;
286 goto unlock;
287 }
288
289 /*
290 * Guarantee that both TLBs and I-cache are private to each vcpu.
291 * The check below is conservative and could lead to over-invalidation,
292 * because there is no need to nuke the contexts if the vcpu belongs to
293 * a different vm.
294 */
295 if (vcpu != __this_cpu_read(last_loaded_vcpu)) {
296 flush_context = true;
297 __this_cpu_write(last_loaded_vcpu, vcpu);
298 }
299
300 vcpu->arch.pkvm.loaded_on_cpu = true;
301
302 hyp_page_ref_inc(hyp_virt_to_page(vm));
303 unlock:
304 hyp_spin_unlock(&shadow_lock);
305
306 /* No need for the lock while flushing the context. */
307 if (flush_context)
308 __kvm_flush_cpu_context(vcpu->arch.hw_mmu);
309
310 return vcpu;
311 }
312
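/* Release a vcpu obtained via get_shadow_vcpu() and drop the vm reference. */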
313 void put_shadow_vcpu(struct kvm_vcpu *vcpu)
314 {
315 struct kvm_shadow_vm *vm = vcpu->arch.pkvm.shadow_vm;
316
317 hyp_spin_lock(&shadow_lock);
318 vcpu->arch.pkvm.loaded_on_cpu = false;
319 hyp_page_ref_dec(hyp_virt_to_page(vm));
320 hyp_spin_unlock(&shadow_lock);
321 }
322
323 /* Check and copy the supported features for the vcpu from the host. */
324 static int copy_features(struct kvm_vcpu *shadow_vcpu, struct kvm_vcpu *host_vcpu)
325 {
326 DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);
327
328 bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES);
329
330 /*
331 * Always allowed:
332 * - CPU starting in poweroff state
333 * - PSCI v0.2
334 */
335 set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features);
336 set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features);
337
338 /*
339 * Check if remaining features are allowed:
340 * - Performance Monitoring
341 * - Scalable Vectors
342 * - Pointer Authentication
343 */
344 if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), PVM_ID_AA64DFR0_ALLOW))
345 set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features);
346
347 if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_SVE), PVM_ID_AA64PFR0_ALLOW))
348 set_bit(KVM_ARM_VCPU_SVE, allowed_features);
349
350 if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_API), PVM_ID_AA64ISAR1_ALLOW) &&
351 FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA), PVM_ID_AA64ISAR1_ALLOW))
352 set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features);
353
354 if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI), PVM_ID_AA64ISAR1_ALLOW) &&
355 FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA), PVM_ID_AA64ISAR1_ALLOW))
356 set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features);
357
358 bitmap_and(shadow_vcpu->arch.features, host_vcpu->arch.features,
359 allowed_features, KVM_VCPU_MAX_FEATURES);
360
361 /*
362 * Check for system support for address/generic pointer authentication
363 * features if either are enabled.
364 */
365 if ((test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, shadow_vcpu->arch.features) ||
366 test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, shadow_vcpu->arch.features)) &&
367 !system_has_full_ptr_auth())
368 return -EINVAL;
369
370 return 0;
371 }
372
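/*
 * Unpin the host vcpu, and its SVE state if present, that were pinned when
 * the shadow vcpu was created.
 */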
373 static void unpin_host_vcpu(struct shadow_vcpu_state *shadow_vcpu)
374 {
375 struct kvm_vcpu *host_vcpu = shadow_vcpu->vcpu.arch.pkvm.host_vcpu;
376 size_t sve_state_size;
377 void *sve_state = shadow_vcpu->vcpu.arch.sve_state;
378
379 hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
380
381 if (!sve_state)
382 return;
383
384 sve_state = kern_hyp_va(sve_state);
385 sve_state_size = vcpu_sve_state_size(&shadow_vcpu->vcpu);
386 hyp_unpin_shared_mem(sve_state, sve_state + sve_state_size);
387 }
388
389 static void unpin_host_vcpus(struct shadow_vcpu_state *shadow_vcpus[], int nr_vcpus)
390 {
391 int i;
392
393 for (i = 0; i < nr_vcpus; i++)
394 unpin_host_vcpu(shadow_vcpus[i]);
395 }
396
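/*
 * Enable pointer authentication for the shadow vcpu if either the address or
 * generic flavour is requested in its features.
 */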
397 static int init_ptrauth(struct kvm_vcpu *shadow_vcpu)
398 {
399 int ret = 0;
400 if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, shadow_vcpu->arch.features) ||
401 test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, shadow_vcpu->arch.features))
402 ret = kvm_vcpu_enable_ptrauth(shadow_vcpu);
403 return ret;
404 }
405
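/* Initialize the vm-wide shadow state from the host's struct kvm. */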
406 static void init_shadow_vm(struct kvm *kvm, struct kvm_shadow_vm *vm,
407 int nr_vcpus)
408 {
409 vm->host_kvm = kvm;
410 vm->created_vcpus = nr_vcpus;
411 vm->arch.pkvm.pvmfw_load_addr = kvm->arch.pkvm.pvmfw_load_addr;
412 vm->arch.pkvm.enabled = READ_ONCE(kvm->arch.pkvm.enabled);
413 }
414
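/*
 * Initialize a shadow vcpu from its host counterpart: pin the host vcpu (and
 * its SVE state if enabled), sanitize the requested features, and set up the
 * initial reset and power state.
 *
 * Return 0 on success, negative error code on failure.
 */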
415 static int init_shadow_vcpu(struct shadow_vcpu_state *shadow_state,
416 struct kvm_vcpu *host_vcpu,
417 struct kvm_shadow_vm *vm, int vcpu_idx)
418 {
419 struct kvm_vcpu *shadow_vcpu = &shadow_state->vcpu;
420 int ret;
421
422 host_vcpu = kern_hyp_va(host_vcpu);
423 if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1))
424 return -EBUSY;
425
426 if (host_vcpu->vcpu_idx != vcpu_idx) {
427 ret = -EINVAL;
428 goto done;
429 }
430
431 shadow_vcpu->arch.pkvm.host_vcpu = host_vcpu;
432 shadow_vcpu->kvm = vm->host_kvm;
433 shadow_vcpu->vcpu_id = host_vcpu->vcpu_id;
434 shadow_vcpu->vcpu_idx = vcpu_idx;
435
436 ret = copy_features(shadow_vcpu, host_vcpu);
437 if (ret)
438 goto done;
439
440 ret = init_ptrauth(shadow_vcpu);
441 if (ret)
442 goto done;
443
444 if (test_bit(KVM_ARM_VCPU_SVE, shadow_vcpu->arch.features)) {
445 size_t sve_state_size;
446 void *sve_state;
447
448 shadow_vcpu->arch.sve_state =
449 READ_ONCE(host_vcpu->arch.sve_state);
450 shadow_vcpu->arch.sve_max_vl =
451 READ_ONCE(host_vcpu->arch.sve_max_vl);
452
453 sve_state = kern_hyp_va(shadow_vcpu->arch.sve_state);
454 sve_state_size = vcpu_sve_state_size(shadow_vcpu);
455
456 if (!shadow_vcpu->arch.sve_state || !sve_state_size ||
457 hyp_pin_shared_mem(sve_state, sve_state + sve_state_size)) {
458 clear_bit(KVM_ARM_VCPU_SVE, shadow_vcpu->arch.features);
459 shadow_vcpu->arch.sve_state = NULL;
460 shadow_vcpu->arch.sve_max_vl = 0;
461 ret = -EINVAL;
462 goto done;
463 }
464 }
465
466 if (vm->arch.pkvm.enabled)
467 pkvm_vcpu_init_traps(shadow_vcpu);
468 kvm_reset_pvm_sys_regs(shadow_vcpu);
469
470 vm->vcpus[vcpu_idx] = shadow_vcpu;
471 shadow_state->vm = vm;
472
473 shadow_vcpu->arch.hw_mmu = &vm->arch.mmu;
474 shadow_vcpu->arch.pkvm.shadow_vm = vm;
475 shadow_vcpu->arch.power_off = true;
476
477 if (test_bit(KVM_ARM_VCPU_POWER_OFF, shadow_vcpu->arch.features)) {
478 shadow_vcpu->arch.pkvm.power_state =
479 PSCI_0_2_AFFINITY_LEVEL_OFF;
480 } else if (pvm_has_pvmfw(vm)) {
481 if (vm->pvmfw_entry_vcpu) {
482 ret = -EINVAL;
483 goto done;
484 }
485
486 vm->pvmfw_entry_vcpu = shadow_vcpu;
487 shadow_vcpu->arch.reset_state.reset = true;
488 shadow_vcpu->arch.pkvm.power_state =
489 PSCI_0_2_AFFINITY_LEVEL_ON_PENDING;
490 } else {
491 struct vcpu_reset_state *reset_state =
492 &shadow_vcpu->arch.reset_state;
493
494 reset_state->pc = *vcpu_pc(host_vcpu);
495 reset_state->r0 = vcpu_get_reg(host_vcpu, 0);
496 reset_state->reset = true;
497 shadow_vcpu->arch.pkvm.power_state =
498 PSCI_0_2_AFFINITY_LEVEL_ON_PENDING;
499 }
500
501 done:
502 if (ret)
503 unpin_host_vcpu(shadow_state);
504
505 return ret;
506 }
507
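/* Return true if a shadow table entry already exists for this host kvm. */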
508 static bool __exists_shadow(struct kvm *host_kvm)
509 {
510 int i;
511 int num_checked = 0;
512
513 for (i = 0; i < KVM_MAX_PVMS && num_checked < num_shadow_entries; i++) {
514 if (!shadow_table[i])
515 continue;
516
517 if (unlikely(shadow_table[i]->host_kvm == host_kvm))
518 return true;
519
520 num_checked++;
521 }
522
523 return false;
524 }
525
526 /*
527 * Allocate a shadow table entry and insert a pointer to the shadow vm.
528 *
529 * Return a unique handle to the protected VM on success,
530 * negative error code on failure.
531 */
532 static int insert_shadow_table(struct kvm *kvm, struct kvm_shadow_vm *vm,
533 size_t shadow_size)
534 {
535 struct kvm_s2_mmu *mmu = &vm->arch.mmu;
536 int shadow_handle;
537 int vmid;
538
539 hyp_assert_lock_held(&shadow_lock);
540
541 if (unlikely(num_shadow_entries >= KVM_MAX_PVMS))
542 return -ENOMEM;
543
544 /*
545 * Initializing protected state might have failed, yet a malicious host
546 * could trigger this function. Thus, ensure that shadow_table exists.
547 */
548 if (unlikely(!shadow_table))
549 return -EINVAL;
550
551 /* Check that a shadow hasn't been created before for this host KVM. */
552 if (unlikely(__exists_shadow(kvm)))
553 return -EEXIST;
554
555 /* Find the next free entry in the shadow table. */
556 while (shadow_table[next_shadow_alloc])
557 next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS;
558 shadow_handle = index_to_shadow_handle(next_shadow_alloc);
559
560 vm->shadow_handle = shadow_handle;
561 vm->shadow_area_size = shadow_size;
562
563 /* VMID 0 is reserved for the host */
564 vmid = next_shadow_alloc + 1;
565 if (vmid > 0xff)
566 return -ENOMEM;
567
568 mmu->vmid.vmid = vmid;
569 mmu->vmid.vmid_gen = 0;
570 mmu->arch = &vm->arch;
571 mmu->pgt = &vm->pgt;
572
573 shadow_table[next_shadow_alloc] = vm;
574 next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS;
575 num_shadow_entries++;
576
577 return shadow_handle;
578 }
579
580 /*
581 * Deallocate and remove the shadow table entry corresponding to the handle.
582 */
583 static void remove_shadow_table(int shadow_handle)
584 {
585 hyp_assert_lock_held(&shadow_lock);
586 shadow_table[shadow_handle_to_index(shadow_handle)] = NULL;
587 num_shadow_entries--;
588 }
589
590 static size_t pkvm_get_shadow_size(int num_vcpus)
591 {
592 /* Shadow space for the vm struct and its array of vcpu state pointers. */
593 return sizeof(struct kvm_shadow_vm) +
594 sizeof(struct shadow_vcpu_state *) * num_vcpus;
595 }
596
597 /*
598 * Check whether the size of the area donated by the host is sufficient for
599 * the shadow structures required for nr_vcpus as well as the shadow vm.
600 */
601 static int check_shadow_size(int nr_vcpus, size_t shadow_size)
602 {
603 if (nr_vcpus < 1 || nr_vcpus > KVM_MAX_VCPUS)
604 return -EINVAL;
605
606 /*
607 * Shadow size is rounded up when allocated and donated by the host,
608 * so it's likely to be larger than the sum of the struct sizes.
609 */
610 if (shadow_size < pkvm_get_shadow_size(nr_vcpus))
611 return -EINVAL;
612
613 return 0;
614 }
615
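/*
 * Return the pages in each vcpu's hyp memcache to the host, accumulating them
 * in the teardown memcache so that the host can reclaim them.
 */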
616 static void drain_shadow_vcpus(struct shadow_vcpu_state *shadow_vcpus[],
617 unsigned int nr_vcpus,
618 struct kvm_hyp_memcache *mc)
619 {
620 int i;
621
622 for (i = 0; i < nr_vcpus; i++) {
623 struct kvm_vcpu *shadow_vcpu = &shadow_vcpus[i]->vcpu;
624 struct kvm_hyp_memcache *vcpu_mc = &shadow_vcpu->arch.pkvm_memcache;
625 void *addr;
626
627 while (vcpu_mc->nr_pages) {
628 addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt);
629 push_hyp_memcache(mc, addr, hyp_virt_to_phys);
630 WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
631 }
632 }
633 }
634
635 /*
636 * Initialize the shadow copy of the protected VM state using the memory
637 * donated by the host.
638 *
639 * Unmaps the donated memory from the host at stage 2.
640 *
641 * kvm: A pointer to the host's struct kvm (host va).
642 * shadow_va: The host va of the area being donated for the shadow state.
643 * Must be page aligned.
644 * shadow_size: The size of the area being donated for the shadow state.
645 * Must be a multiple of the page size.
646 * pgd: The host va of the area being donated for the stage-2 PGD for the VM.
647 * Must be page aligned. Its size is implied by the VM's VTCR.
648 *
649 * Return a unique handle to the protected VM on success,
650 * negative error code on failure.
651 */
652 int __pkvm_init_shadow(struct kvm *kvm,
653 void *shadow_va,
654 size_t shadow_size,
655 void *pgd)
656 {
657 struct kvm_shadow_vm *vm = kern_hyp_va(shadow_va);
658 phys_addr_t shadow_pa = hyp_virt_to_phys(vm);
659 u64 pfn = hyp_phys_to_pfn(shadow_pa);
660 u64 nr_shadow_pages = shadow_size >> PAGE_SHIFT;
661 u64 nr_pgd_pages;
662 size_t pgd_size;
663 int nr_vcpus = 0;
664 int ret = 0;
665
666 /* Check that the donated memory is aligned to page boundaries. */
667 if (!PAGE_ALIGNED(shadow_va) ||
668 !PAGE_ALIGNED(shadow_size) ||
669 !PAGE_ALIGNED(pgd))
670 return -EINVAL;
671
672 kvm = kern_hyp_va(kvm);
673 pgd = kern_hyp_va(pgd);
674
675 ret = hyp_pin_shared_mem(kvm, kvm + 1);
676 if (ret)
677 return ret;
678
679 /* Ensure the host has donated enough memory for the shadow structs. */
680 nr_vcpus = kvm->created_vcpus;
681 ret = check_shadow_size(nr_vcpus, shadow_size);
682 if (ret)
683 goto err;
684
685 ret = __pkvm_host_donate_hyp(pfn, nr_shadow_pages);
686 if (ret)
687 goto err;
688
689 /* Ensure we're working with a clean slate. */
690 memset(vm, 0, shadow_size);
691
692 vm->arch.vtcr = host_kvm.arch.vtcr;
693 pgd_size = kvm_pgtable_stage2_pgd_size(host_kvm.arch.vtcr);
694 nr_pgd_pages = pgd_size >> PAGE_SHIFT;
695 ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(pgd), nr_pgd_pages);
696 if (ret)
697 goto err_remove_mappings;
698
699 init_shadow_vm(kvm, vm, nr_vcpus);
700
701 /* Add the entry to the shadow table. */
702 hyp_spin_lock(&shadow_lock);
703 ret = insert_shadow_table(kvm, vm, shadow_size);
704 if (ret < 0)
705 goto err_unlock;
706
707 ret = kvm_guest_prepare_stage2(vm, pgd);
708 if (ret)
709 goto err_remove_shadow_table;
710
711 hyp_spin_unlock(&shadow_lock);
712 return vm->shadow_handle;
713
714 err_remove_shadow_table:
715 remove_shadow_table(vm->shadow_handle);
716 err_unlock:
717 hyp_spin_unlock(&shadow_lock);
718 WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(pgd), nr_pgd_pages));
719 err_remove_mappings:
720 /* Clear the donated shadow memory on failure to avoid data leaks. */
721 memset(vm, 0, shadow_size);
722 WARN_ON(__pkvm_hyp_donate_host(hyp_phys_to_pfn(shadow_pa),
723 shadow_size >> PAGE_SHIFT));
724 err:
725 hyp_unpin_shared_mem(kvm, kvm + 1);
726 return ret;
727 }
728
729 /*
730 * Initialize the protected vcpu state shadow copy in host-donated memory.
731 *
732 * shadow_handle: The handle for the protected vm.
733 * host_vcpu: A pointer to the corresponding host vcpu (host va).
734 * shadow_vcpu_hva: The host va of the area being donated for the vcpu state.
735 * Must be page aligned. The size of the area must be equal to
736 * the page-aligned size of struct shadow_vcpu_state.
737 *
738 * Return 0 on success, negative error code on failure.
739 */
740 int __pkvm_init_shadow_vcpu(unsigned int shadow_handle,
741 struct kvm_vcpu *host_vcpu,
742 void *shadow_vcpu_hva)
743 {
744 struct kvm_shadow_vm *vm;
745 struct shadow_vcpu_state *shadow_state = kern_hyp_va(shadow_vcpu_hva);
746 size_t vcpu_state_sz = sizeof(*shadow_state);
747 u64 nr_pages = PAGE_ALIGN(vcpu_state_sz) >> PAGE_SHIFT;
748 unsigned int idx;
749 int ret;
750
751 if (!PAGE_ALIGNED(shadow_vcpu_hva))
752 return -EINVAL;
753
754 ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(shadow_state),
755 nr_pages);
756 if (ret)
757 return ret;
758
759 memset(shadow_state, 0, vcpu_state_sz);
760
761 hyp_spin_lock(&shadow_lock);
762
763 vm = find_shadow_by_handle(shadow_handle);
764 if (!vm) {
765 ret = -ENOENT;
766 goto unlock;
767 }
768
769 idx = vm->nr_vcpus;
770 if (idx >= vm->created_vcpus) {
771 ret = -EINVAL;
772 goto unlock;
773 }
774
775 ret = init_shadow_vcpu(shadow_state, host_vcpu, vm, idx);
776 if (ret)
777 goto unlock;
778
779 vm->shadow_vcpus[idx] = shadow_state;
780 vm->nr_vcpus++;
781 unlock:
782 hyp_spin_unlock(&shadow_lock);
783
784 if (ret) {
785 memset(shadow_state, 0, vcpu_state_sz);
786 WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(shadow_state),
787 nr_pages));
788 }
789
790 return ret;
791 }
792
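/*
 * Scrub memory that was donated to hyp, add it to the teardown memcache, and
 * return ownership of the underlying pages to the host.
 */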
793 static void teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr,
794 size_t size)
795 {
796 u64 pfn = hyp_phys_to_pfn(__hyp_pa(addr));
797 u64 nr_pages = size >> PAGE_SHIFT;
798 void *start;
799
800 memset(addr, 0, size);
801 kvm_flush_dcache_to_poc(addr, size);
802
803 for (start = addr; start < addr + size; start += PAGE_SIZE)
804 push_hyp_memcache(mc, start, hyp_virt_to_phys);
805
806 WARN_ON(__pkvm_hyp_donate_host(pfn, nr_pages));
807 }
808
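/*
 * Tear down the shadow state of a protected VM and return its donated memory
 * and guest pages to the host.
 *
 * Fails if any of the vm's vcpus are still loaded.
 *
 * Return 0 on success, negative error code on failure.
 */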
809 int __pkvm_teardown_shadow(int shadow_handle)
810 {
811 struct kvm_hyp_memcache *mc;
812 struct kvm_shadow_vm *vm;
813 struct kvm *host_kvm;
814 unsigned int nr_vcpus;
815 int err;
816 int i;
817
818 /* Lookup then remove entry from the shadow table. */
819 hyp_spin_lock(&shadow_lock);
820 vm = find_shadow_by_handle(shadow_handle);
821 if (!vm) {
822 err = -ENOENT;
823 goto err_unlock;
824 }
825
826 if (WARN_ON(hyp_page_count(vm))) {
827 err = -EBUSY;
828 goto err_unlock;
829 }
830
831 host_kvm = vm->host_kvm;
832 nr_vcpus = vm->nr_vcpus;
833
834 /*
835 * Clear this vm's vcpus from the per-cpu last_loaded_vcpu tracking, in case
836 * the same vcpu addresses are reused for future vms.
837 */
838 for (i = 0; i < hyp_nr_cpus; i++) {
839 struct kvm_vcpu **last_loaded_vcpu_ptr =
840 per_cpu_ptr(&last_loaded_vcpu, i);
841 struct kvm_vcpu *vcpu = *last_loaded_vcpu_ptr;
842
843 if (vcpu && vcpu->arch.pkvm.shadow_vm == vm)
844 *last_loaded_vcpu_ptr = NULL;
845 }
846
847 /* Ensure the VMID is clean before it can be reallocated */
848 __kvm_tlb_flush_vmid(&vm->arch.mmu);
849 remove_shadow_table(shadow_handle);
850 hyp_spin_unlock(&shadow_lock);
851
852 /* Reclaim guest pages, and page-table pages */
853 mc = &host_kvm->arch.pkvm.teardown_mc;
854 reclaim_guest_pages(vm, mc);
855 drain_shadow_vcpus(vm->shadow_vcpus, nr_vcpus, mc);
856 unpin_host_vcpus(vm->shadow_vcpus, nr_vcpus);
857
858 for (i = 0; i < nr_vcpus; i++)
859 teardown_donated_memory(mc, vm->shadow_vcpus[i],
860 PAGE_ALIGN(sizeof(*vm->shadow_vcpus[i])));
861 teardown_donated_memory(mc, vm, vm->shadow_area_size);
862
863 hyp_unpin_shared_mem(host_kvm, host_kvm + 1);
864 return 0;
865
866 err_unlock:
867 hyp_spin_unlock(&shadow_lock);
868 return err;
869 }
870
871 int pkvm_load_pvmfw_pages(struct kvm_shadow_vm *vm, u64 ipa, phys_addr_t phys,
872 u64 size)
873 {
874 struct kvm_protected_vm *pkvm = &vm->arch.pkvm;
875 u64 npages, offset = ipa - pkvm->pvmfw_load_addr;
876 void *src = hyp_phys_to_virt(pvmfw_base) + offset;
877
878 if (offset >= pvmfw_size)
879 return -EINVAL;
880
881 size = min(size, pvmfw_size - offset);
882 if (!PAGE_ALIGNED(size) || !PAGE_ALIGNED(src))
883 return -EINVAL;
884
885 npages = size >> PAGE_SHIFT;
886 while (npages--) {
887 void *dst;
888
889 dst = hyp_fixmap_map(phys);
890 if (!dst)
891 return -EINVAL;
892
893 /*
894 * No need for cache maintenance here, as the pgtable code will
895 * take care of this when installing the pte in the guest's
896 * stage-2 page table.
897 */
898 memcpy(dst, src, PAGE_SIZE);
899
900 hyp_fixmap_unmap();
901 src += PAGE_SIZE;
902 phys += PAGE_SIZE;
903 }
904
905 return 0;
906 }
907
908 void pkvm_clear_pvmfw_pages(void)
909 {
910 void *addr = hyp_phys_to_virt(pvmfw_base);
911
912 memset(addr, 0, pvmfw_size);
913 kvm_flush_dcache_to_poc(addr, pvmfw_size);
914 }
915
916 /*
917 * This function sets the registers on the vcpu to their architecturally defined
918 * reset values.
919 *
920 * Note: Can only be called by the vcpu on itself, after it has been turned on.
921 */
922 void pkvm_reset_vcpu(struct kvm_vcpu *vcpu)
923 {
924 struct vcpu_reset_state *reset_state = &vcpu->arch.reset_state;
925 struct kvm_shadow_vm *vm = vcpu->arch.pkvm.shadow_vm;
926
927 WARN_ON(!reset_state->reset);
928
929 init_ptrauth(vcpu);
930
931 /* Reset core registers */
932 memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
933 memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
934 vcpu_gp_regs(vcpu)->pstate = VCPU_RESET_PSTATE_EL1;
935
936 /* Reset system registers */
937 kvm_reset_pvm_sys_regs(vcpu);
938
939 /* Propagate initiator's endianness, after kvm_reset_pvm_sys_regs. */
940 if (reset_state->be)
941 kvm_vcpu_set_be(vcpu);
942
943 if (vm->pvmfw_entry_vcpu == vcpu) {
944 struct kvm_vcpu *host_vcpu = vcpu->arch.pkvm.host_vcpu;
945 u64 entry = vm->arch.pkvm.pvmfw_load_addr;
946 int i;
947
948 /* X0 - X14 provided by the VMM (preserved) */
949 for (i = 0; i <= 14; ++i)
950 vcpu_set_reg(vcpu, i, vcpu_get_reg(host_vcpu, i));
951
952 /* X15: Boot protocol version */
953 vcpu_set_reg(vcpu, 15, 0);
954
955 /* PC: IPA of pvmfw base */
956 *vcpu_pc(vcpu) = entry;
957
958 vm->pvmfw_entry_vcpu = NULL;
959
960 /* Auto enroll MMIO guard */
961 set_bit(KVM_ARCH_FLAG_MMIO_GUARD,
962 &vcpu->arch.pkvm.shadow_vm->arch.flags);
963 } else {
964 *vcpu_pc(vcpu) = reset_state->pc;
965 vcpu_set_reg(vcpu, 0, reset_state->r0);
966 }
967
968 reset_state->reset = false;
969
970 vcpu->arch.pkvm.exit_code = 0;
971
972 WARN_ON(vcpu->arch.pkvm.power_state != PSCI_0_2_AFFINITY_LEVEL_ON_PENDING);
973 WRITE_ONCE(vcpu->arch.power_off, false);
974 WRITE_ONCE(vcpu->arch.pkvm.power_state, PSCI_0_2_AFFINITY_LEVEL_ON);
975 }
976
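/* Return the vcpu of this vm whose MPIDR affinity matches, or NULL if none. */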
977 struct kvm_vcpu *pvm_mpidr_to_vcpu(struct kvm_shadow_vm *vm, unsigned long mpidr)
978 {
979 struct kvm_vcpu *vcpu;
980 int i;
981
982 mpidr &= MPIDR_HWID_BITMASK;
983
984 for (i = 0; i < READ_ONCE(vm->nr_vcpus); i++) {
985 vcpu = vm->vcpus[i];
986
987 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
988 return vcpu;
989 }
990
991 return NULL;
992 }
993
994 /*
995 * Returns true if the hypervisor handled the PSCI call, and control should go back
996 * to the guest, or false if the host needs to do some additional work (i.e.,
997 * wake up the vcpu).
998 */
999 static bool pvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
1000 {
1001 struct kvm_shadow_vm *vm = source_vcpu->arch.pkvm.shadow_vm;
1002 struct kvm_vcpu *vcpu;
1003 struct vcpu_reset_state *reset_state;
1004 unsigned long cpu_id;
1005 unsigned long hvc_ret_val;
1006 int power_state;
1007
1008 cpu_id = smccc_get_arg1(source_vcpu);
1009 if (!kvm_psci_valid_affinity(source_vcpu, cpu_id)) {
1010 hvc_ret_val = PSCI_RET_INVALID_PARAMS;
1011 goto error;
1012 }
1013
1014 vcpu = pvm_mpidr_to_vcpu(vm, cpu_id);
1015
1016 /* Make sure the caller requested a valid vcpu. */
1017 if (!vcpu) {
1018 hvc_ret_val = PSCI_RET_INVALID_PARAMS;
1019 goto error;
1020 }
1021
1022 /*
1023 * Make sure the requested vcpu is not on to begin with.
1024 * Atomic to avoid a race between vcpus trying to power on the same vcpu.
1025 */
1026 power_state = cmpxchg(&vcpu->arch.pkvm.power_state,
1027 PSCI_0_2_AFFINITY_LEVEL_OFF,
1028 PSCI_0_2_AFFINITY_LEVEL_ON_PENDING);
1029 switch (power_state) {
1030 case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING:
1031 hvc_ret_val = PSCI_RET_ON_PENDING;
1032 goto error;
1033 case PSCI_0_2_AFFINITY_LEVEL_ON:
1034 hvc_ret_val = PSCI_RET_ALREADY_ON;
1035 goto error;
1036 case PSCI_0_2_AFFINITY_LEVEL_OFF:
1037 break;
1038 default:
1039 hvc_ret_val = PSCI_RET_INTERNAL_FAILURE;
1040 goto error;
1041 }
1042
1043 reset_state = &vcpu->arch.reset_state;
1044
1045 reset_state->pc = smccc_get_arg2(source_vcpu);
1046 reset_state->r0 = smccc_get_arg3(source_vcpu);
1047
1048 /* Propagate caller endianness */
1049 reset_state->be = kvm_vcpu_is_be(source_vcpu);
1050
1051 reset_state->reset = true;
1052
1053 /*
1054 * Return to the host, which should issue the KVM_REQ_VCPU_RESET request
1055 * and call kvm_vcpu_wake_up() to schedule the vcpu.
1056 */
1057 return false;
1058
1059 error:
1060 /* If there's an error go back straight to the guest. */
1061 smccc_set_retval(source_vcpu, hvc_ret_val, 0, 0, 0);
1062 return true;
1063 }
1064
1065 static bool pvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
1066 {
1067 int i, matching_cpus = 0;
1068 unsigned long mpidr;
1069 unsigned long target_affinity;
1070 unsigned long target_affinity_mask;
1071 unsigned long lowest_affinity_level;
1072 struct kvm_shadow_vm *vm = vcpu->arch.pkvm.shadow_vm;
1073 struct kvm_vcpu *tmp;
1074 unsigned long hvc_ret_val;
1075
1076 target_affinity = smccc_get_arg1(vcpu);
1077 lowest_affinity_level = smccc_get_arg2(vcpu);
1078
1079 if (!kvm_psci_valid_affinity(vcpu, target_affinity)) {
1080 hvc_ret_val = PSCI_RET_INVALID_PARAMS;
1081 goto done;
1082 }
1083
1084 /* Determine target affinity mask */
1085 target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
1086 if (!target_affinity_mask) {
1087 hvc_ret_val = PSCI_RET_INVALID_PARAMS;
1088 goto done;
1089 }
1090
1091 /* Ignore other bits of target affinity */
1092 target_affinity &= target_affinity_mask;
1093
1094 hvc_ret_val = PSCI_0_2_AFFINITY_LEVEL_OFF;
1095
1096 /*
1097 * If at least one vcpu matching the target affinity is ON then return ON;
1098 * otherwise, if at least one is ON_PENDING then return ON_PENDING;
1099 * otherwise, return OFF.
1100 */
1101 for (i = 0; i < READ_ONCE(vm->nr_vcpus); i++) {
1102 tmp = vm->vcpus[i];
1103 mpidr = kvm_vcpu_get_mpidr_aff(tmp);
1104
1105 if ((mpidr & target_affinity_mask) == target_affinity) {
1106 int power_state;
1107
1108 matching_cpus++;
1109 power_state = READ_ONCE(tmp->arch.pkvm.power_state);
1110 switch (power_state) {
1111 case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING:
1112 hvc_ret_val = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING;
1113 break;
1114 case PSCI_0_2_AFFINITY_LEVEL_ON:
1115 hvc_ret_val = PSCI_0_2_AFFINITY_LEVEL_ON;
1116 goto done;
1117 case PSCI_0_2_AFFINITY_LEVEL_OFF:
1118 break;
1119 default:
1120 hvc_ret_val = PSCI_RET_INTERNAL_FAILURE;
1121 goto done;
1122 }
1123 }
1124 }
1125
1126 if (!matching_cpus)
1127 hvc_ret_val = PSCI_RET_INVALID_PARAMS;
1128
1129 done:
1130 /* Nothing to be handled by the host. Go back to the guest. */
1131 smccc_set_retval(vcpu, hvc_ret_val, 0, 0, 0);
1132 return true;
1133 }
1134
1135 /*
1136 * Returns true if the hypervisor has handled the PSCI call, and control should
1137 * go back to the guest, or false if the host needs to do some additional work
1138 * (e.g., turn off and update vcpu scheduling status).
1139 */
1140 static bool pvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
1141 {
1142 WARN_ON(vcpu->arch.power_off);
1143 WARN_ON(vcpu->arch.pkvm.power_state != PSCI_0_2_AFFINITY_LEVEL_ON);
1144
1145 WRITE_ONCE(vcpu->arch.power_off, true);
1146 WRITE_ONCE(vcpu->arch.pkvm.power_state, PSCI_0_2_AFFINITY_LEVEL_OFF);
1147
1148 /* Return to the host so that it can finish powering off the vcpu. */
1149 return false;
1150 }
1151
1152 static bool pvm_psci_version(struct kvm_vcpu *vcpu)
1153 {
1154 /* Nothing to be handled by the host. Go back to the guest. */
1155 smccc_set_retval(vcpu, KVM_ARM_PSCI_1_1, 0, 0, 0);
1156 return true;
1157 }
1158
1159 static bool pvm_psci_not_supported(struct kvm_vcpu *vcpu)
1160 {
1161 /* Nothing to be handled by the host. Go back to the guest. */
1162 smccc_set_retval(vcpu, PSCI_RET_NOT_SUPPORTED, 0, 0, 0);
1163 return true;
1164 }
1165
1166 static bool pvm_psci_features(struct kvm_vcpu *vcpu)
1167 {
1168 u32 feature = smccc_get_arg1(vcpu);
1169 unsigned long val;
1170
1171 switch (feature) {
1172 case PSCI_0_2_FN_PSCI_VERSION:
1173 case PSCI_0_2_FN_CPU_SUSPEND:
1174 case PSCI_0_2_FN64_CPU_SUSPEND:
1175 case PSCI_0_2_FN_CPU_OFF:
1176 case PSCI_0_2_FN_CPU_ON:
1177 case PSCI_0_2_FN64_CPU_ON:
1178 case PSCI_0_2_FN_AFFINITY_INFO:
1179 case PSCI_0_2_FN64_AFFINITY_INFO:
1180 case PSCI_0_2_FN_SYSTEM_OFF:
1181 case PSCI_0_2_FN_SYSTEM_RESET:
1182 case PSCI_1_0_FN_PSCI_FEATURES:
1183 case PSCI_1_1_FN_SYSTEM_RESET2:
1184 case PSCI_1_1_FN64_SYSTEM_RESET2:
1185 case ARM_SMCCC_VERSION_FUNC_ID:
1186 val = PSCI_RET_SUCCESS;
1187 break;
1188 default:
1189 val = PSCI_RET_NOT_SUPPORTED;
1190 break;
1191 }
1192
1193 /* Nothing to be handled by the host. Go back to the guest. */
1194 smccc_set_retval(vcpu, val, 0, 0, 0);
1195 return true;
1196 }
1197
1198 static bool pkvm_handle_psci(struct kvm_vcpu *vcpu)
1199 {
1200 u32 psci_fn = smccc_get_function(vcpu);
1201
1202 switch (psci_fn) {
1203 case PSCI_0_2_FN_CPU_ON:
1204 kvm_psci_narrow_to_32bit(vcpu);
1205 fallthrough;
1206 case PSCI_0_2_FN64_CPU_ON:
1207 return pvm_psci_vcpu_on(vcpu);
1208 case PSCI_0_2_FN_CPU_OFF:
1209 return pvm_psci_vcpu_off(vcpu);
1210 case PSCI_0_2_FN_AFFINITY_INFO:
1211 kvm_psci_narrow_to_32bit(vcpu);
1212 fallthrough;
1213 case PSCI_0_2_FN64_AFFINITY_INFO:
1214 return pvm_psci_vcpu_affinity_info(vcpu);
1215 case PSCI_0_2_FN_PSCI_VERSION:
1216 return pvm_psci_version(vcpu);
1217 case PSCI_1_0_FN_PSCI_FEATURES:
1218 return pvm_psci_features(vcpu);
1219 case PSCI_0_2_FN_SYSTEM_RESET:
1220 case PSCI_0_2_FN_CPU_SUSPEND:
1221 case PSCI_0_2_FN64_CPU_SUSPEND:
1222 case PSCI_0_2_FN_SYSTEM_OFF:
1223 case PSCI_1_1_FN_SYSTEM_RESET2:
1224 case PSCI_1_1_FN64_SYSTEM_RESET2:
1225 return false; /* Handled by the host. */
1226 default:
1227 break;
1228 }
1229
1230 return pvm_psci_not_supported(vcpu);
1231 }
1232
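/*
 * Convert the MEM_SHARE hypercall into a synthetic stage-2 data abort so that
 * the host maps the page at the given IPA, and rewind the ELR so that the
 * guest retries the hypercall afterwards.
 */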
1233 static u64 __pkvm_memshare_page_req(struct kvm_vcpu *vcpu, u64 ipa)
1234 {
1235 u64 elr;
1236
1237 /* Fake up a data abort (Level 3 translation fault on write) */
1238 vcpu->arch.fault.esr_el2 = (u32)ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT |
1239 ESR_ELx_WNR | ESR_ELx_FSC_FAULT |
1240 FIELD_PREP(ESR_ELx_FSC_LEVEL, 3);
1241
1242 /* Shuffle the IPA around into the HPFAR */
1243 vcpu->arch.fault.hpfar_el2 = (ipa >> 8) & HPFAR_MASK;
1244
1245 /* This is a virtual address. 0's good. Let's go with 0. */
1246 vcpu->arch.fault.far_el2 = 0;
1247
1248 /* Rewind the ELR so we return to the HVC once the IPA is mapped */
1249 elr = read_sysreg(elr_el2);
1250 elr -= 4;
1251 write_sysreg(elr, elr_el2);
1252
1253 return ARM_EXCEPTION_TRAP;
1254 }
1255
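/*
 * Handle the MEM_SHARE hypercall from a protected guest.
 *
 * Returns true if the call was handled entirely at hyp and control should go
 * back to the guest, false if the host needs to complete the operation.
 */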
1256 static bool pkvm_memshare_call(struct kvm_vcpu *vcpu, u64 *exit_code)
1257 {
1258 u64 ipa = smccc_get_arg1(vcpu);
1259 u64 arg2 = smccc_get_arg2(vcpu);
1260 u64 arg3 = smccc_get_arg3(vcpu);
1261 int err;
1262
1263 if (arg2 || arg3)
1264 goto out_guest_err;
1265
1266 err = __pkvm_guest_share_host(vcpu, ipa);
1267 switch (err) {
1268 case 0:
1269 /* Success! Now tell the host. */
1270 goto out_host;
1271 case -EFAULT:
1272 /*
1273 * Convert the exception into a data abort so that the page
1274 * being shared is mapped into the guest next time.
1275 */
1276 *exit_code = __pkvm_memshare_page_req(vcpu, ipa);
1277 goto out_host;
1278 }
1279
1280 out_guest_err:
1281 smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0);
1282 return true;
1283
1284 out_host:
1285 return false;
1286 }
1287
1288 static bool pkvm_memunshare_call(struct kvm_vcpu *vcpu)
1289 {
1290 u64 ipa = smccc_get_arg1(vcpu);
1291 u64 arg2 = smccc_get_arg2(vcpu);
1292 u64 arg3 = smccc_get_arg3(vcpu);
1293 int err;
1294
1295 if (arg2 || arg3)
1296 goto out_guest_err;
1297
1298 err = __pkvm_guest_unshare_host(vcpu, ipa);
1299 if (err)
1300 goto out_guest_err;
1301
1302 return false;
1303
1304 out_guest_err:
1305 smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0);
1306 return true;
1307 }
1308
1309 static bool pkvm_install_ioguard_page(struct kvm_vcpu *vcpu, u64 *exit_code)
1310 {
1311 u64 retval = SMCCC_RET_SUCCESS;
1312 u64 ipa = smccc_get_arg1(vcpu);
1313 int ret;
1314
1315 ret = __pkvm_install_ioguard_page(vcpu, ipa);
1316 if (ret == -ENOMEM) {
1317 /*
1318 * We ran out of memcache, let's ask for more. Cancel
1319 * the effects of the HVC that took us here, and
1320 * forward the hypercall to the host for page donation
1321 * purposes.
1322 */
1323 write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
1324 return false;
1325 }
1326
1327 if (ret)
1328 retval = SMCCC_RET_INVALID_PARAMETER;
1329
1330 smccc_set_retval(vcpu, retval, 0, 0, 0);
1331 return true;
1332 }
1333
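/* Whether the SMCCC TRNG service can be used (set outside this file). */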
1334 bool smccc_trng_available;
1335
1336 static bool pkvm_forward_trng(struct kvm_vcpu *vcpu)
1337 {
1338 u32 fn = smccc_get_function(vcpu);
1339 struct arm_smccc_res res;
1340 unsigned long arg1 = 0;
1341
1342 /*
1343 * Forward TRNG calls to EL3, as we can't trust the host to handle
1344 * these for us.
1345 */
1346 switch (fn) {
1347 case ARM_SMCCC_TRNG_FEATURES:
1348 case ARM_SMCCC_TRNG_RND32:
1349 case ARM_SMCCC_TRNG_RND64:
1350 arg1 = smccc_get_arg1(vcpu);
1351 fallthrough;
1352 case ARM_SMCCC_TRNG_VERSION:
1353 case ARM_SMCCC_TRNG_GET_UUID:
1354 arm_smccc_1_1_smc(fn, arg1, &res);
1355 smccc_set_retval(vcpu, res.a0, res.a1, res.a2, res.a3);
1356 memzero_explicit(&res, sizeof(res));
1357 break;
1358 }
1359
1360 return true;
1361 }
1362
1363 /*
1364 * Handler for protected VM HVC calls.
1365 *
1366 * Returns true if the hypervisor has handled the exit, and control should go
1367 * back to the guest, or false if it hasn't.
1368 */
1369 bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code)
1370 {
1371 u32 fn = smccc_get_function(vcpu);
1372 u64 val[4] = { SMCCC_RET_NOT_SUPPORTED };
1373
1374 switch (fn) {
1375 case ARM_SMCCC_VERSION_FUNC_ID:
1376 /* Nothing to be handled by the host. Go back to the guest. */
1377 val[0] = ARM_SMCCC_VERSION_1_1;
1378 break;
1379 case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
1380 val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
1381 val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
1382 val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
1383 val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
1384 break;
1385 case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
1386 val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
1387 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO);
1388 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE);
1389 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE);
1390 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO);
1391 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL);
1392 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP);
1393 val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP);
1394 break;
1395 case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID:
1396 set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->arch.pkvm.shadow_vm->arch.flags);
1397 val[0] = SMCCC_RET_SUCCESS;
1398 break;
1399 case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID:
1400 return pkvm_install_ioguard_page(vcpu, exit_code);
1401 case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID:
1402 if (__pkvm_remove_ioguard_page(vcpu, vcpu_get_reg(vcpu, 1)))
1403 val[0] = SMCCC_RET_INVALID_PARAMETER;
1404 else
1405 val[0] = SMCCC_RET_SUCCESS;
1406 break;
1407 case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID:
1408 case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID:
1409 if (smccc_get_arg1(vcpu) ||
1410 smccc_get_arg2(vcpu) ||
1411 smccc_get_arg3(vcpu)) {
1412 val[0] = SMCCC_RET_INVALID_PARAMETER;
1413 } else {
1414 val[0] = PAGE_SIZE;
1415 }
1416 break;
1417 case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID:
1418 return pkvm_memshare_call(vcpu, exit_code);
1419 case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID:
1420 return pkvm_memunshare_call(vcpu);
1421 case ARM_SMCCC_TRNG_VERSION ... ARM_SMCCC_TRNG_RND32:
1422 case ARM_SMCCC_TRNG_RND64:
1423 if (smccc_trng_available)
1424 return pkvm_forward_trng(vcpu);
1425 break;
1426 default:
1427 return pkvm_handle_psci(vcpu);
1428 }
1429
1430 smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
1431 return true;
1432 }
1433