1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5 */
6
7 #include <linux/cma.h>
8 #include <linux/dma-map-ops.h>
9 #include <linux/mman.h>
10 #include <linux/kvm_host.h>
11 #include <linux/io.h>
12 #include <linux/hugetlb.h>
13 #include <linux/interval_tree_generic.h>
14 #include <linux/sched/signal.h>
15 #include <trace/events/kvm.h>
16 #include <asm/pgalloc.h>
17 #include <asm/cacheflush.h>
18 #include <asm/kvm_arm.h>
19 #include <asm/kvm_mmu.h>
20 #include <asm/kvm_pgtable.h>
21 #include <asm/kvm_pkvm.h>
22 #include <asm/kvm_ras.h>
23 #include <asm/kvm_asm.h>
24 #include <asm/kvm_emulate.h>
25 #include <asm/kvm_pkvm.h>
26 #include <asm/virt.h>
27
28 #include "trace.h"
29
30 static struct kvm_pgtable *hyp_pgtable;
31 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
32
33 static unsigned long __ro_after_init hyp_idmap_start;
34 static unsigned long __ro_after_init hyp_idmap_end;
35 static phys_addr_t __ro_after_init hyp_idmap_vector;
36
37 u32 __ro_after_init hyp_va_bits;
38
39 static unsigned long __ro_after_init io_map_base;
40
41 static bool stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
42 static bool stage2_pte_is_counted(kvm_pte_t pte, u32 level);
43
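/*
 * Stage-2 PTE ops used for the host-managed page tables: never force
 * PTE-granularity mappings, and treat any non-zero PTE as counted.
 */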
44 static struct kvm_pgtable_pte_ops kvm_s2_pte_ops = {
45 .force_pte_cb = stage2_force_pte_cb,
46 .pte_is_counted_cb = stage2_pte_is_counted
47
48 };
49
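/*
 * Dispatch to the regular or the protected (pkvm_*) variant of a page-table
 * helper, e.g. KVM_PGT_FN(kvm_pgtable_stage2_map) resolves to
 * pkvm_pgtable_stage2_map when protected KVM is enabled.
 */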
50 #define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn)
51
52 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
53 phys_addr_t size)
54 {
55 phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
56
57 return (boundary - 1 < end - 1) ? boundary : end;
58 }
59
60 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
61 {
62 phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
63
64 return __stage2_range_addr_end(addr, end, size);
65 }
66
67 /*
68 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
69 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
70 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
71 * long will also starve other vCPUs.
72 */
73 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
74 phys_addr_t end,
75 int (*fn)(struct kvm_s2_mmu *, u64, u64),
76 bool resched)
77 {
78 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
79 int ret;
80 u64 next;
81
82 do {
83 next = stage2_range_addr_end(addr, end);
84 ret = fn(mmu, addr, next - addr);
85 if (ret)
86 break;
87
88 if (resched && next != end)
89 cond_resched_rwlock_write(&kvm->mmu_lock);
90 } while (addr = next, addr != end);
91
92 return ret;
93 }
94
95 #define stage2_apply_range_resched(mmu, addr, end, fn) \
96 stage2_apply_range(mmu, addr, end, fn, true)
97
98 /*
99 * Get the maximum number of page-tables pages needed to split a range
100 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
101 * mapped at level 2, or at level 1 if allowed.
102 */
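/*
 * For example, assuming 4K pages with level-1 blocks allowed, splitting a
 * 1G range needs DIV_ROUND_UP(1G, PUD_SIZE) + DIV_ROUND_UP(1G, PMD_SIZE)
 * = 1 + 512 page-table pages.
 */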
103 static int kvm_mmu_split_nr_page_tables(u64 range)
104 {
105 int n = 0;
106
107 if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
108 n += DIV_ROUND_UP(range, PUD_SIZE);
109 n += DIV_ROUND_UP(range, PMD_SIZE);
110 return n;
111 }
112
113 static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
114 {
115 struct kvm_mmu_memory_cache *cache;
116 u64 chunk_size, min;
117
118 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
119 return true;
120
121 chunk_size = kvm->arch.mmu.split_page_chunk_size;
122 min = kvm_mmu_split_nr_page_tables(chunk_size);
123 cache = &kvm->arch.mmu.split_page_cache;
124 return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
125 }
126
127 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
128 phys_addr_t end)
129 {
130 struct kvm_mmu_memory_cache *cache;
131 struct kvm_pgtable *pgt;
132 int ret, cache_capacity;
133 u64 next, chunk_size;
134
135 lockdep_assert_held_write(&kvm->mmu_lock);
136
137 chunk_size = kvm->arch.mmu.split_page_chunk_size;
138 cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
139
140 if (chunk_size == 0)
141 return 0;
142
143 cache = &kvm->arch.mmu.split_page_cache;
144
145 do {
146 if (need_split_memcache_topup_or_resched(kvm)) {
147 write_unlock(&kvm->mmu_lock);
148 cond_resched();
149 /* Eager page splitting is best-effort. */
150 ret = __kvm_mmu_topup_memory_cache(cache,
151 cache_capacity,
152 cache_capacity);
153 write_lock(&kvm->mmu_lock);
154 if (ret)
155 break;
156 }
157
158 pgt = kvm->arch.mmu.pgt;
159 if (!pgt)
160 return -EINVAL;
161
162 next = __stage2_range_addr_end(addr, end, chunk_size);
163 ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
164 if (ret)
165 break;
166 } while (addr = next, addr != end);
167
168 return ret;
169 }
170
171 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
172 {
173 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
174 }
175
176 /**
177 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
178 * @kvm: pointer to kvm structure.
179 *
180 * Interface to HYP function to flush all VM TLB entries
181 */
182 int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
183 {
184 if (is_protected_kvm_enabled())
185 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
186 else
187 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
188 return 0;
189 }
190
191 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
192 gfn_t gfn, u64 nr_pages)
193 {
194 u64 size = nr_pages << PAGE_SHIFT;
195 u64 addr = gfn << PAGE_SHIFT;
196
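/*
 * pKVM only exposes a whole-VMID invalidation, so the requested range is
 * ignored when protected mode is enabled.
 */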
197 if (is_protected_kvm_enabled())
198 kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
199 else
200 kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
201 return 0;
202 }
203
204 static bool kvm_is_device_pfn(unsigned long pfn)
205 {
206 return !pfn_is_map_memory(pfn);
207 }
208
209 static void *stage2_memcache_zalloc_page(void *arg)
210 {
211 struct kvm_mmu_memory_cache *mc = arg;
212 void *virt;
213
214 /* Allocated with __GFP_ZERO, so no need to zero */
215 virt = kvm_mmu_memory_cache_alloc(mc);
216 if (virt)
217 kvm_account_pgtable_pages(virt, 1);
218 return virt;
219 }
220
221 static void *kvm_host_zalloc_pages_exact(size_t size)
222 {
223 return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
224 }
225
226 static void *kvm_s2_zalloc_pages_exact(size_t size)
227 {
228 void *virt = kvm_host_zalloc_pages_exact(size);
229
230 if (virt)
231 kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
232 return virt;
233 }
234
235 static void kvm_s2_free_pages_exact(void *virt, size_t size)
236 {
237 kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
238 free_pages_exact(virt, size);
239 }
240
241 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
242
243 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
244 {
245 struct page *page = container_of(head, struct page, rcu_head);
246 void *pgtable = page_to_virt(page);
247 s8 level = page_private(page);
248
249 KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, &kvm_s2_pte_ops,
250 pgtable, level);
251 }
252
253 static void stage2_free_unlinked_table(void *addr, s8 level)
254 {
255 struct page *page = virt_to_page(addr);
256
257 set_page_private(page, (unsigned long)level);
258 call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
259 }
260
261 static void kvm_host_get_page(void *addr)
262 {
263 get_page(virt_to_page(addr));
264 }
265
266 static void kvm_host_put_page(void *addr)
267 {
268 put_page(virt_to_page(addr));
269 }
270
271 static void kvm_s2_put_page(void *addr)
272 {
273 struct page *p = virt_to_page(addr);
274 /* Dropping last refcount, the page will be freed */
275 if (page_count(p) == 1)
276 kvm_account_pgtable_pages(addr, -1);
277 put_page(p);
278 }
279
280 static int kvm_host_page_count(void *addr)
281 {
282 return page_count(virt_to_page(addr));
283 }
284
285 static void clean_dcache_guest_page(void *va, size_t size)
286 {
287 __clean_dcache_guest_page(va, size);
288 }
289
290 static void invalidate_icache_guest_page(void *va, size_t size)
291 {
292 __invalidate_icache_guest_page(va, size);
293 }
294
295 static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
296 {
297 return ppage->ipa;
298 }
299
300 static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
301 {
302 return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
303 }
304
305 INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
306 __pinned_page_start, __pinned_page_end, /* empty */,
307 kvm_pinned_pages);
308
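/*
 * Iterate over the pinned pages whose IPA ranges overlap [start, end),
 * fetching the next node up front so that the current one may be removed.
 */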
309 #define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \
310 for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
311 __ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; }); \
312 __ppage = __tmp)
313
314 /*
315 * Unmapping vs dcache management:
316 *
317 * If a guest maps certain memory pages as uncached, all writes will
318 * bypass the data cache and go directly to RAM. However, the CPUs
319 * can still speculate reads (not writes) and fill cache lines with
320 * data.
321 *
322 * Those cache lines will be *clean* cache lines though, so a
323 * clean+invalidate operation is equivalent to an invalidate
324 * operation, because no cache lines are marked dirty.
325 *
326 * Those clean cache lines could be filled prior to an uncached write
327 * by the guest, and the cache coherent IO subsystem would therefore
328 * end up writing old data to disk.
329 *
330 * This is why right after unmapping a page/section and invalidating
331 * the corresponding TLBs, we flush to make sure the IO subsystem will
332 * never hit in the cache.
333 *
334 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
335 * we then fully enforce cacheability of RAM, no matter what the guest
336 * does.
337 */
338 /**
339 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
340 * @mmu: The KVM stage-2 MMU pointer
341 * @start: The intermediate physical base address of the range to unmap
342 * @size: The size of the area to unmap
343 * @may_block: Whether or not we are permitted to block
344 *
345 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
346 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
347 * destroying the VM), otherwise another faulting VCPU may come in and mess
348 * with things behind our backs.
349 */
350
351 static int ___unmap_stage2_range(struct kvm_s2_mmu *mmu, u64 addr, u64 size)
352 {
353 return KVM_PGT_FN(kvm_pgtable_stage2_unmap)(mmu->pgt, addr, size);
354 }
355
356 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
357 bool may_block)
358 {
359 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
360 phys_addr_t end = start + size;
361
362 if (is_protected_kvm_enabled() && kvm->arch.pkvm.enabled)
363 return;
364
365 lockdep_assert_held_write(&kvm->mmu_lock);
366 WARN_ON(size & ~PAGE_MASK);
367 WARN_ON(stage2_apply_range(mmu, start, end, ___unmap_stage2_range, may_block));
368 }
369
370 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
371 u64 size, bool may_block)
372 {
373 __unmap_stage2_range(mmu, start, size, may_block);
374 }
375
376 static int __stage2_flush_range(struct kvm_s2_mmu *mmu, u64 addr, u64 size)
377 {
378 return KVM_PGT_FN(kvm_pgtable_stage2_flush)(mmu->pgt, addr, size);
379 }
380
381 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
382 {
383 stage2_apply_range_resched(mmu, addr, end, __stage2_flush_range);
384 }
385
386 static void stage2_flush_memslot(struct kvm *kvm,
387 struct kvm_memory_slot *memslot)
388 {
389 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
390 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
391
392 kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
393 }
394
395 /**
396 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
397 * @kvm: The struct kvm pointer
398 *
399 * Go through the stage 2 page tables and invalidate any cache lines
400 * backing memory already mapped to the VM.
401 */
402 static void stage2_flush_vm(struct kvm *kvm)
403 {
404 struct kvm_memslots *slots;
405 struct kvm_memory_slot *memslot;
406 int idx, bkt;
407
408 idx = srcu_read_lock(&kvm->srcu);
409 write_lock(&kvm->mmu_lock);
410
411 slots = kvm_memslots(kvm);
412 kvm_for_each_memslot(memslot, bkt, slots)
413 stage2_flush_memslot(kvm, memslot);
414
415 kvm_nested_s2_flush(kvm);
416
417 write_unlock(&kvm->mmu_lock);
418 srcu_read_unlock(&kvm->srcu, idx);
419 }
420
421 /**
422 * free_hyp_pgds - free Hyp-mode page tables
423 */
424 void __init free_hyp_pgds(void)
425 {
426 mutex_lock(&kvm_hyp_pgd_mutex);
427 if (hyp_pgtable) {
428 kvm_pgtable_hyp_destroy(hyp_pgtable);
429 kfree(hyp_pgtable);
430 hyp_pgtable = NULL;
431 }
432 mutex_unlock(&kvm_hyp_pgd_mutex);
433 }
434
435 static bool kvm_host_owns_hyp_mappings(void)
436 {
437 if (is_kernel_in_hyp_mode())
438 return false;
439
440 if (static_branch_likely(&kvm_protected_mode_initialized))
441 return false;
442
443 /*
444 * This can happen at boot time when __create_hyp_mappings() is called
445 * after the hyp protection has been enabled, but the static key has
446 * not been flipped yet.
447 */
448 if (!hyp_pgtable && is_protected_kvm_enabled())
449 return false;
450
451 WARN_ON(!hyp_pgtable);
452
453 return true;
454 }
455
456 int __create_hyp_mappings(unsigned long start, unsigned long size,
457 unsigned long phys, enum kvm_pgtable_prot prot)
458 {
459 int err;
460
461 if (WARN_ON(!kvm_host_owns_hyp_mappings()))
462 return -EINVAL;
463
464 mutex_lock(&kvm_hyp_pgd_mutex);
465 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
466 mutex_unlock(&kvm_hyp_pgd_mutex);
467
468 return err;
469 }
470
471 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
472 {
473 if (!is_vmalloc_addr(kaddr)) {
474 BUG_ON(!virt_addr_valid(kaddr));
475 return __pa(kaddr);
476 } else {
477 return page_to_phys(vmalloc_to_page(kaddr)) +
478 offset_in_page(kaddr);
479 }
480 }
481
482 struct hyp_shared_pfn {
483 u64 pfn;
484 int count;
485 struct rb_node node;
486 };
487
488 static DEFINE_MUTEX(hyp_shared_pfns_lock);
489 static struct rb_root hyp_shared_pfns = RB_ROOT;
490
491 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
492 struct rb_node **parent)
493 {
494 struct hyp_shared_pfn *this;
495
496 *node = &hyp_shared_pfns.rb_node;
497 *parent = NULL;
498 while (**node) {
499 this = container_of(**node, struct hyp_shared_pfn, node);
500 *parent = **node;
501 if (this->pfn < pfn)
502 *node = &((**node)->rb_left);
503 else if (this->pfn > pfn)
504 *node = &((**node)->rb_right);
505 else
506 return this;
507 }
508
509 return NULL;
510 }
511
512 static int share_pfn_hyp(u64 pfn)
513 {
514 struct rb_node **node, *parent;
515 struct hyp_shared_pfn *this;
516 int ret = 0;
517
518 mutex_lock(&hyp_shared_pfns_lock);
519 this = find_shared_pfn(pfn, &node, &parent);
520 if (this) {
521 this->count++;
522 goto unlock;
523 }
524
525 this = kzalloc(sizeof(*this), GFP_KERNEL);
526 if (!this) {
527 ret = -ENOMEM;
528 goto unlock;
529 }
530
531 this->pfn = pfn;
532 this->count = 1;
533 rb_link_node(&this->node, parent, node);
534 rb_insert_color(&this->node, &hyp_shared_pfns);
535 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
536 unlock:
537 mutex_unlock(&hyp_shared_pfns_lock);
538
539 return ret;
540 }
541
542 static int unshare_pfn_hyp(u64 pfn)
543 {
544 struct rb_node **node, *parent;
545 struct hyp_shared_pfn *this;
546 int ret = 0;
547
548 mutex_lock(&hyp_shared_pfns_lock);
549 this = find_shared_pfn(pfn, &node, &parent);
550 if (WARN_ON(!this)) {
551 ret = -ENOENT;
552 goto unlock;
553 }
554
555 this->count--;
556 if (this->count)
557 goto unlock;
558
559 rb_erase(&this->node, &hyp_shared_pfns);
560 kfree(this);
561 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
562 unlock:
563 mutex_unlock(&hyp_shared_pfns_lock);
564
565 return ret;
566 }
567
568 int kvm_share_hyp(void *from, void *to)
569 {
570 phys_addr_t start, end, cur;
571 u64 pfn;
572 int ret;
573
574 if (is_kernel_in_hyp_mode())
575 return 0;
576
577 /*
578 * The share hcall maps things in the 'fixed-offset' region of the hyp
579 * VA space, so we can only share physically contiguous data-structures
580 * for now.
581 */
582 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
583 return -EINVAL;
584
585 if (kvm_host_owns_hyp_mappings())
586 return create_hyp_mappings(from, to, PAGE_HYP);
587
588 start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
589 end = PAGE_ALIGN(__pa(to));
590 for (cur = start; cur < end; cur += PAGE_SIZE) {
591 pfn = __phys_to_pfn(cur);
592 ret = share_pfn_hyp(pfn);
593 if (ret)
594 return ret;
595 }
596
597 return 0;
598 }
599
600 void kvm_unshare_hyp(void *from, void *to)
601 {
602 phys_addr_t start, end, cur;
603 u64 pfn;
604
605 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
606 return;
607
608 start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
609 end = PAGE_ALIGN(__pa(to));
610 for (cur = start; cur < end; cur += PAGE_SIZE) {
611 pfn = __phys_to_pfn(cur);
612 WARN_ON(unshare_pfn_hyp(pfn));
613 }
614 }
615
616 /**
617 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
618 * @from: The virtual kernel start address of the range
619 * @to: The virtual kernel end address of the range (exclusive)
620 * @prot: The protection to be applied to this range
621 *
622 * The same virtual address as the kernel virtual address is also used
623 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
624 * physical pages.
625 */
626 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
627 {
628 phys_addr_t phys_addr;
629 unsigned long virt_addr;
630 unsigned long start = kern_hyp_va((unsigned long)from);
631 unsigned long end = kern_hyp_va((unsigned long)to);
632
633 if (is_kernel_in_hyp_mode())
634 return 0;
635
636 if (!kvm_host_owns_hyp_mappings())
637 return -EPERM;
638
639 start = start & PAGE_MASK;
640 end = PAGE_ALIGN(end);
641
642 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
643 int err;
644
645 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
646 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
647 prot);
648 if (err)
649 return err;
650 }
651
652 return 0;
653 }
654
655 static int __hyp_alloc_private_va_range(unsigned long base)
656 {
657 lockdep_assert_held(&kvm_hyp_pgd_mutex);
658
659 if (!PAGE_ALIGNED(base))
660 return -EINVAL;
661
662 /*
663 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
664 * allocating the new area, as it would indicate we've
665 * overflowed the idmap/IO address range.
666 */
667 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
668 return -ENOMEM;
669
670 io_map_base = base;
671
672 return 0;
673 }
674
675 /**
676 * hyp_alloc_private_va_range - Allocates a private VA range.
677 * @size: The size of the VA range to reserve.
678 * @haddr: The hypervisor virtual start address of the allocation.
679 *
680 * The private virtual address (VA) range is allocated below io_map_base
681 * and aligned based on the order of @size.
682 *
683 * Return: 0 on success or negative error code on failure.
684 */
685 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
686 {
687 unsigned long base;
688 int ret = 0;
689
690 mutex_lock(&kvm_hyp_pgd_mutex);
691
692 /*
693 * This assumes that we have enough space below the idmap
694 * page to allocate our VAs. If not, the check in
695 * __hyp_alloc_private_va_range() will kick. A potential
696 * alternative would be to detect that overflow and switch
697 * to an allocation above the idmap.
698 *
699 * The allocated size is always a multiple of PAGE_SIZE.
700 */
701 size = PAGE_ALIGN(size);
702 base = io_map_base - size;
703 ret = __hyp_alloc_private_va_range(base);
704
705 mutex_unlock(&kvm_hyp_pgd_mutex);
706
707 if (!ret)
708 *haddr = base;
709
710 return ret;
711 }
712
713 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
714 unsigned long *haddr,
715 enum kvm_pgtable_prot prot)
716 {
717 unsigned long addr;
718 int ret = 0;
719
720 if (!kvm_host_owns_hyp_mappings()) {
721 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
722 phys_addr, size, prot);
723 if (IS_ERR_VALUE(addr))
724 return addr;
725 *haddr = addr;
726
727 return 0;
728 }
729
730 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
731 ret = hyp_alloc_private_va_range(size, &addr);
732 if (ret)
733 return ret;
734
735 ret = __create_hyp_mappings(addr, size, phys_addr, prot);
736 if (ret)
737 return ret;
738
739 *haddr = addr + offset_in_page(phys_addr);
740 return ret;
741 }
742
743 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
744 {
745 unsigned long base;
746 size_t size;
747 int ret;
748
749 mutex_lock(&kvm_hyp_pgd_mutex);
750 /*
751 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
752 * an alignment of our allocation on the order of the size.
753 */
754 size = NVHE_STACK_SIZE * 2;
755 base = ALIGN_DOWN(io_map_base - size, size);
756
757 ret = __hyp_alloc_private_va_range(base);
758
759 mutex_unlock(&kvm_hyp_pgd_mutex);
760
761 if (ret) {
762 kvm_err("Cannot allocate hyp stack guard page\n");
763 return ret;
764 }
765
766 /*
767 * Since the stack grows downwards, map the stack to the page
768 * at the higher address and leave the lower guard page
769 * unbacked.
770 *
771 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
772 * and addresses corresponding to the guard page have the
773 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
774 */
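/*
 * For instance, with NVHE_STACK_SIZE == PAGE_SIZE == SZ_4K, base is
 * 8K-aligned: [base, base + 4K) is the unbacked guard page (bit 12 clear),
 * [base + 4K, base + 8K) is the stack (bit 12 set), and *haddr = base + 8K
 * is returned as the stack top.
 */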
775 ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
776 phys_addr, PAGE_HYP);
777 if (ret)
778 kvm_err("Cannot map hyp stack\n");
779
780 *haddr = base + size;
781
782 return ret;
783 }
784
785 /**
786 * create_hyp_io_mappings - Map IO into both kernel and HYP
787 * @phys_addr: The physical start address which gets mapped
788 * @size: Size of the region being mapped
789 * @kaddr: Kernel VA for this mapping
790 * @haddr: HYP VA for this mapping
791 */
792 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
793 void __iomem **kaddr,
794 void __iomem **haddr)
795 {
796 unsigned long addr;
797 int ret;
798
799 if (is_protected_kvm_enabled())
800 return -EPERM;
801
802 *kaddr = ioremap(phys_addr, size);
803 if (!*kaddr)
804 return -ENOMEM;
805
806 if (is_kernel_in_hyp_mode()) {
807 *haddr = *kaddr;
808 return 0;
809 }
810
811 ret = __create_hyp_private_mapping(phys_addr, size,
812 &addr, PAGE_HYP_DEVICE);
813 if (ret) {
814 iounmap(*kaddr);
815 *kaddr = NULL;
816 *haddr = NULL;
817 return ret;
818 }
819
820 *haddr = (void __iomem *)addr;
821 return 0;
822 }
823
824 /**
825 * create_hyp_exec_mappings - Map an executable range into HYP
826 * @phys_addr: The physical start address which gets mapped
827 * @size: Size of the region being mapped
828 * @haddr: HYP VA for this mapping
829 */
830 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
831 void **haddr)
832 {
833 unsigned long addr;
834 int ret;
835
836 BUG_ON(is_kernel_in_hyp_mode());
837
838 ret = __create_hyp_private_mapping(phys_addr, size,
839 &addr, PAGE_HYP_EXEC);
840 if (ret) {
841 *haddr = NULL;
842 return ret;
843 }
844
845 *haddr = (void *)addr;
846 return 0;
847 }
848
849 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
850 /* We shouldn't need any other callback to walk the PT */
851 .phys_to_virt = kvm_host_va,
852 };
853
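/*
 * Walk the VMM's stage-1 (userspace) page tables and return the size, in
 * bytes, of the leaf mapping backing @addr, or a negative error code.
 */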
854 static int get_user_mapping_size(struct kvm *kvm, u64 addr)
855 {
856 struct kvm_pgtable pgt = {
857 .pgd = (kvm_pteref_t)kvm->mm->pgd,
858 .ia_bits = vabits_actual,
859 .start_level = (KVM_PGTABLE_LAST_LEVEL -
860 ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
861 .mm_ops = &kvm_user_mm_ops,
862 };
863 unsigned long flags;
864 kvm_pte_t pte = 0; /* Keep GCC quiet... */
865 s8 level = S8_MAX;
866 int ret;
867
868 /*
869 * Disable IRQs so that we hazard against a concurrent
870 * teardown of the userspace page tables (which relies on
871 * IPI-ing threads).
872 */
873 local_irq_save(flags);
874 ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
875 local_irq_restore(flags);
876
877 if (ret)
878 return ret;
879
880 /*
881 * Not seeing an error, but not updating level? Something went
882 * deeply wrong...
883 */
884 if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
885 return -EFAULT;
886 if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
887 return -EFAULT;
888
889 /* Oops, the userspace PTs are gone... Replay the fault */
890 if (!kvm_pte_valid(pte))
891 return -EAGAIN;
892
893 return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
894 }
895
896 static bool stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
897 {
898 return false;
899 }
900
901 static bool stage2_pte_is_counted(kvm_pte_t pte, u32 level)
902
903 {
904 return !!pte;
905 }
906
907 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
908 .zalloc_page = stage2_memcache_zalloc_page,
909 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
910 .free_pages_exact = kvm_s2_free_pages_exact,
911 .free_unlinked_table = stage2_free_unlinked_table,
912 .get_page = kvm_host_get_page,
913 .put_page = kvm_s2_put_page,
914 .page_count = kvm_host_page_count,
915 .phys_to_virt = kvm_host_va,
916 .virt_to_phys = kvm_host_pa,
917 .dcache_clean_inval_poc = clean_dcache_guest_page,
918 .icache_inval_pou = invalidate_icache_guest_page,
919 };
920
921 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
922 {
923 u32 kvm_ipa_limit = get_kvm_ipa_limit();
924 u64 mmfr0, mmfr1;
925 u32 phys_shift;
926
927 phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
928 if (is_protected_kvm_enabled()) {
929 phys_shift = kvm_ipa_limit;
930 } else if (phys_shift) {
931 if (phys_shift > kvm_ipa_limit ||
932 phys_shift < ARM64_MIN_PARANGE_BITS)
933 return -EINVAL;
934 } else {
935 phys_shift = KVM_PHYS_SHIFT;
936 if (phys_shift > kvm_ipa_limit) {
937 pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
938 current->comm);
939 return -EINVAL;
940 }
941 }
942
943 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
944 mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
945 mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
946
947 return 0;
948 }
949
950 /**
951 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
952 * @kvm: The pointer to the KVM structure
953 * @mmu: The pointer to the s2 MMU structure
954 * @type: The machine type of the virtual machine
955 *
956 * Allocates only the stage-2 HW PGD level table(s).
957 * Note we don't need locking here as this is only called in two cases:
958 *
959 * - when the VM is created, which can't race against anything
960 *
961 * - when secondary kvm_s2_mmu structures are initialised for NV
962 * guests, and the caller must hold kvm->lock as this is called on a
963 * per-vcpu basis.
964 */
965 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
966 {
967 int cpu, err;
968 struct kvm_pgtable *pgt;
969
970 kvm->arch.pkvm.pinned_pages = RB_ROOT_CACHED;
971
972 /*
973 * If we already have our page tables in place, and that the
974 * MMU context is the canonical one, we have a bug somewhere,
975 * as this is only supposed to ever happen once per VM.
976 *
977 * Otherwise, we're building nested page tables, and that's
978 * probably because userspace called KVM_ARM_VCPU_INIT more
979 * than once on the same vcpu. Since that's actually legal,
980 * don't kick a fuss and leave gracefully.
981 */
982 if (mmu->pgt != NULL) {
983 if (kvm_is_nested_s2_mmu(kvm, mmu))
984 return 0;
985
986 kvm_err("kvm_arch already initialized?\n");
987 return -EINVAL;
988 }
989
990 err = kvm_init_ipa_range(mmu, type);
991 if (err)
992 return err;
993
994 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
995 if (!pgt)
996 return -ENOMEM;
997
998 mmu->arch = &kvm->arch;
999 err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops, &kvm_s2_pte_ops);
1000 if (err)
1001 goto out_free_pgtable;
1002
1003 mmu->pgt = pgt;
1004 if (is_protected_kvm_enabled())
1005 return 0;
1006
1007 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
1008 if (!mmu->last_vcpu_ran) {
1009 err = -ENOMEM;
1010 goto out_destroy_pgtable;
1011 }
1012
1013 for_each_possible_cpu(cpu)
1014 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
1015
1016 /* The eager page splitting is disabled by default */
1017 mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
1018 mmu->split_page_cache.gfp_zero = __GFP_ZERO;
1019
1020 mmu->pgd_phys = __pa(pgt->pgd);
1021
1022 if (kvm_is_nested_s2_mmu(kvm, mmu))
1023 kvm_init_nested_s2_mmu(mmu);
1024
1025 return 0;
1026
1027 out_destroy_pgtable:
1028 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
1029 out_free_pgtable:
1030 kfree(pgt);
1031 return err;
1032 }
1033
1034 void kvm_uninit_stage2_mmu(struct kvm *kvm)
1035 {
1036 write_lock(&kvm->mmu_lock);
1037 kvm_stage2_unmap_range(&kvm->arch.mmu, 0, BIT(VTCR_EL2_IPA(kvm->arch.mmu.vtcr)), true);
1038 write_unlock(&kvm->mmu_lock);
1039 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
1040 }
1041
1042 static void stage2_unmap_memslot(struct kvm *kvm,
1043 struct kvm_memory_slot *memslot)
1044 {
1045 hva_t hva = memslot->userspace_addr;
1046 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
1047 phys_addr_t size = PAGE_SIZE * memslot->npages;
1048 hva_t reg_end = hva + size;
1049
1050 /*
1051 * A memory region could potentially cover multiple VMAs, and any holes
1052 * between them, so iterate over all of them to find out if we should
1053 * unmap any of them.
1054 *
1055 * +--------------------------------------------+
1056 * +---------------+----------------+ +----------------+
1057 * | : VMA 1 | VMA 2 | | VMA 3 : |
1058 * +---------------+----------------+ +----------------+
1059 * | memory region |
1060 * +--------------------------------------------+
1061 */
1062 do {
1063 struct vm_area_struct *vma;
1064 hva_t vm_start, vm_end;
1065
1066 vma = find_vma_intersection(current->mm, hva, reg_end);
1067 if (!vma)
1068 break;
1069
1070 /*
1071 * Take the intersection of this VMA with the memory region
1072 */
1073 vm_start = max(hva, vma->vm_start);
1074 vm_end = min(reg_end, vma->vm_end);
1075
1076 if (!(vma->vm_flags & VM_PFNMAP)) {
1077 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
1078 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
1079 }
1080 hva = vm_end;
1081 } while (hva < reg_end);
1082 }
1083
1084 /**
1085 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
1086 * @kvm: The struct kvm pointer
1087 *
1088 * Go through the memregions and unmap any regular RAM
1089 * backing memory already mapped to the VM.
1090 */
1091 void stage2_unmap_vm(struct kvm *kvm)
1092 {
1093 struct kvm_memslots *slots;
1094 struct kvm_memory_slot *memslot;
1095 int idx, bkt;
1096
1097 idx = srcu_read_lock(&kvm->srcu);
1098 mmap_read_lock(current->mm);
1099 write_lock(&kvm->mmu_lock);
1100
1101 slots = kvm_memslots(kvm);
1102 kvm_for_each_memslot(memslot, bkt, slots)
1103 stage2_unmap_memslot(kvm, memslot);
1104
1105 kvm_nested_s2_unmap(kvm, true);
1106
1107 write_unlock(&kvm->mmu_lock);
1108 mmap_read_unlock(current->mm);
1109 srcu_read_unlock(&kvm->srcu, idx);
1110 }
1111
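/*
 * Detach the stage-2 page table from the MMU under the write lock, then
 * destroy and free it once the lock has been dropped.
 */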
1112 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
1113 {
1114 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
1115 struct kvm_pgtable *pgt = NULL;
1116
1117 write_lock(&kvm->mmu_lock);
1118 pgt = mmu->pgt;
1119 if (pgt) {
1120 mmu->pgd_phys = 0;
1121 mmu->pgt = NULL;
1122 free_percpu(mmu->last_vcpu_ran);
1123 }
1124 write_unlock(&kvm->mmu_lock);
1125
1126 if (pgt) {
1127 KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
1128 kfree(pgt);
1129 }
1130 }
1131
1132 static void hyp_mc_free_fn(void *addr, void *flags, unsigned long order)
1133 {
1134 static const u8 pmd_order = PMD_SHIFT - PAGE_SHIFT;
1135
1136 if (!addr)
1137 return;
1138
1139 if ((unsigned long)flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
1140 kvm_account_pgtable_pages(addr, -1);
1141
1142 /* The iommu pool supports top-up from dma_contiguous_default_area */
1143 if (order == pmd_order &&
1144 kvm_iommu_cma_release(virt_to_page(addr)))
1145 return;
1146
1147 free_pages((unsigned long)addr, order);
1148 }
1149
1150 static void *hyp_mc_alloc_fn(void *flags, unsigned long order)
1151 {
1152 unsigned long __flags = (unsigned long)flags;
1153 gfp_t gfp_mask;
1154 void *addr;
1155
1156 gfp_mask = __flags & HYP_MEMCACHE_ACCOUNT_KMEMCG ?
1157 GFP_KERNEL_ACCOUNT : GFP_KERNEL;
1158
1159 addr = (void *)__get_free_pages(gfp_mask, order);
1160
1161 if (addr && __flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
1162 kvm_account_pgtable_pages(addr, 1);
1163
1164 return addr;
1165 }
1166
1167 static void *hyp_mc_alloc_gfp_fn(void *flags, unsigned long order)
1168 {
1169 return (void *)__get_free_pages(*(gfp_t *)flags, order);
1170 }
1171
1172 void free_hyp_memcache(struct kvm_hyp_memcache *mc)
1173 {
1174 unsigned long flags = mc->flags;
1175
1176 if (!is_protected_kvm_enabled())
1177 return;
1178
1179 kfree(mc->mapping);
1180 __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, (void *)flags);
1181 }
1182
1183 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
1184 unsigned long order)
1185 {
1186 unsigned long flags = mc->flags;
1187
1188 if (!is_protected_kvm_enabled())
1189 return 0;
1190
1191 if (order > PAGE_SHIFT)
1192 return -E2BIG;
1193
1194 if (!mc->mapping) {
1195 mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
1196 if (!mc->mapping)
1197 return -ENOMEM;
1198 }
1199
1200 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
1201 kvm_host_pa, (void *)flags, order);
1202 }
1203 EXPORT_SYMBOL(topup_hyp_memcache);
1204
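/*
 * Like topup_hyp_memcache(), but also accounts the newly donated pages in
 * kvm->stat.protected_hyp_mem.
 */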
1205 static int topup_hyp_memcache_account(struct kvm *kvm, struct kvm_hyp_memcache *mc,
1206 unsigned long min_pages, unsigned long order)
1207 {
1208 u64 nr_pages = mc->nr_pages;
1209 int ret;
1210
1211 ret = topup_hyp_memcache(mc, min_pages, order);
1212 if (ret)
1213 return -ENOMEM;
1214
1215 nr_pages = mc->nr_pages - nr_pages;
1216 atomic64_add(nr_pages << PAGE_SHIFT, &kvm->stat.protected_hyp_mem);
1217
1218 return 0;
1219 }
1220
1221 int topup_hyp_memcache_gfp(struct kvm_hyp_memcache *mc, unsigned long min_pages,
1222 unsigned long order, gfp_t gfp)
1223 {
1224 void *flags = &gfp;
1225
1226 if (!is_protected_kvm_enabled())
1227 return 0;
1228
1229 if (order > PAGE_SHIFT)
1230 return -E2BIG;
1231
1232 return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_gfp_fn,
1233 kvm_host_pa, flags, order);
1234 }
1235
1236 /**
1237 * kvm_phys_addr_ioremap - map a device range to guest IPA
1238 *
1239 * @kvm: The KVM pointer
1240 * @guest_ipa: The IPA at which to insert the mapping
1241 * @pa: The physical address of the device
1242 * @size: The size of the mapping
1243 * @writable: Whether or not to create a writable mapping
1244 */
1245 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1246 phys_addr_t pa, unsigned long size, bool writable)
1247 {
1248 phys_addr_t addr;
1249 int ret = 0;
1250 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
1251 struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
1252 struct kvm_pgtable *pgt = mmu->pgt;
1253 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1254 KVM_PGTABLE_PROT_R |
1255 (writable ? KVM_PGTABLE_PROT_W : 0);
1256
1257 if (is_protected_kvm_enabled())
1258 return -EPERM;
1259
1260 size += offset_in_page(guest_ipa);
1261 guest_ipa &= PAGE_MASK;
1262
1263 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1264 ret = kvm_mmu_topup_memory_cache(&cache,
1265 kvm_mmu_cache_min_pages(mmu));
1266 if (ret)
1267 break;
1268
1269 write_lock(&kvm->mmu_lock);
1270 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
1271 pa, prot, &cache, 0);
1272 write_unlock(&kvm->mmu_lock);
1273 if (ret)
1274 break;
1275
1276 pa += PAGE_SIZE;
1277 }
1278
1279 kvm_mmu_free_memory_cache(&cache);
1280 return ret;
1281 }
1282
1283 static int __stage2_wp_range(struct kvm_s2_mmu *mmu, u64 addr, u64 size)
1284 {
1285 return KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)(mmu->pgt, addr, size);
1286 }
1287
1288 /**
1289 * kvm_stage2_wp_range() - write protect stage2 memory region range
1290 * @mmu: The KVM stage-2 MMU pointer
1291 * @addr: Start address of range
1292 * @end: End address of range
1293 */
1294 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1295 {
1296 stage2_apply_range_resched(mmu, addr, end, __stage2_wp_range);
1297 }
1298
1299 /**
1300 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1301 * @kvm: The KVM pointer
1302 * @slot: The memory slot to write protect
1303 *
1304 * Called to start logging dirty pages after memory region
1305 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1306 * all present PUD, PMD and PTEs are write protected in the memory region.
1307 * Afterwards read of dirty page log can be called.
1308 *
1309 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1310 * serializing operations for VM memory regions.
1311 */
1312 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1313 {
1314 struct kvm_memslots *slots = kvm_memslots(kvm);
1315 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1316 phys_addr_t start, end;
1317
1318 if (WARN_ON_ONCE(!memslot))
1319 return;
1320
1321 start = memslot->base_gfn << PAGE_SHIFT;
1322 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1323
1324 write_lock(&kvm->mmu_lock);
1325 kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
1326 kvm_nested_s2_wp(kvm);
1327 write_unlock(&kvm->mmu_lock);
1328 kvm_flush_remote_tlbs_memslot(kvm, memslot);
1329 }
1330
1331 /**
1332 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
1333 * pages for memory slot
1334 * @kvm: The KVM pointer
1335 * @slot: The memory slot to split
1336 *
1337 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
1338 * serializing operations for VM memory regions.
1339 */
1340 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
1341 {
1342 struct kvm_memslots *slots;
1343 struct kvm_memory_slot *memslot;
1344 phys_addr_t start, end;
1345
1346 lockdep_assert_held(&kvm->slots_lock);
1347
1348 slots = kvm_memslots(kvm);
1349 memslot = id_to_memslot(slots, slot);
1350
1351 start = memslot->base_gfn << PAGE_SHIFT;
1352 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1353
1354 write_lock(&kvm->mmu_lock);
1355 kvm_mmu_split_huge_pages(kvm, start, end);
1356 write_unlock(&kvm->mmu_lock);
1357 }
1358
1359 /*
1360 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
1361 * @kvm: The KVM pointer
1362 * @slot: The memory slot associated with mask
1363 * @gfn_offset: The gfn offset in memory slot
1364 * @mask: The mask of pages at offset 'gfn_offset' in this memory
1365 * slot to enable dirty logging on
1366 *
1367 * Writes protect selected pages to enable dirty logging, and then
1368 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
1369 */
1370 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1371 struct kvm_memory_slot *slot,
1372 gfn_t gfn_offset, unsigned long mask)
1373 {
1374 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1375 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
1376 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1377
1378 lockdep_assert_held_write(&kvm->mmu_lock);
1379
1380 kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
1381
1382 /*
1383 * Eager-splitting is done when manual-protect is set. We
1384 * also check for initially-all-set because we can avoid
1385 * eager-splitting if initially-all-set is false.
1386 * Initially-all-set equal false implies that huge-pages were
1387 * already split when enabling dirty logging: no need to do it
1388 * again.
1389 */
1390 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1391 kvm_mmu_split_huge_pages(kvm, start, end);
1392
1393 kvm_nested_s2_wp(kvm);
1394 }
1395
1396 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1397 {
1398 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1399 }
1400
1401 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1402 unsigned long hva,
1403 unsigned long map_size)
1404 {
1405 gpa_t gpa_start;
1406 hva_t uaddr_start, uaddr_end;
1407 size_t size;
1408
1409 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1410 if (map_size == PAGE_SIZE)
1411 return true;
1412
1413 size = memslot->npages * PAGE_SIZE;
1414
1415 gpa_start = memslot->base_gfn << PAGE_SHIFT;
1416
1417 uaddr_start = memslot->userspace_addr;
1418 uaddr_end = uaddr_start + size;
1419
1420 /*
1421 * Pages belonging to memslots that don't have the same alignment
1422 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1423 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1424 *
1425 * Consider a layout like the following:
1426 *
1427 * memslot->userspace_addr:
1428 * +-----+--------------------+--------------------+---+
1429 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
1430 * +-----+--------------------+--------------------+---+
1431 *
1432 * memslot->base_gfn << PAGE_SHIFT:
1433 * +---+--------------------+--------------------+-----+
1434 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
1435 * +---+--------------------+--------------------+-----+
1436 *
1437 * If we create those stage-2 blocks, we'll end up with this incorrect
1438 * mapping:
1439 * d -> f
1440 * e -> g
1441 * f -> h
1442 */
1443 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1444 return false;
1445
1446 /*
1447 * Next, let's make sure we're not trying to map anything not covered
1448 * by the memslot. This means we have to prohibit block size mappings
1449 * for the beginning and end of a non-block aligned and non-block sized
1450 * memory slot (illustrated by the head and tail parts of the
1451 * userspace view above containing pages 'abcde' and 'xyz',
1452 * respectively).
1453 *
1454 * Note that it doesn't matter if we do the check using the
1455 * userspace_addr or the base_gfn, as both are equally aligned (per
1456 * the check above) and equally sized.
1457 */
1458 return (hva & ~(map_size - 1)) >= uaddr_start &&
1459 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1460 }
1461
1462 /*
1463 * Check if the given hva is backed by a transparent huge page (THP) and
1464 * whether it can be mapped using block mapping in stage2. If so, adjust
1465 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1466 * supported. This will need to be updated to support other THP sizes.
1467 *
1468 * Returns the size of the mapping.
1469 */
1470 static long
1471 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
1472 unsigned long hva, kvm_pfn_t *pfnp,
1473 phys_addr_t *ipap)
1474 {
1475 kvm_pfn_t pfn = *pfnp;
1476
1477 /*
1478 * Make sure the adjustment is done only for THP pages. Also make
1479 * sure that the HVA and IPA are sufficiently aligned and that the
1480 * block map is contained within the memslot.
1481 */
1482 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1483 int sz = get_user_mapping_size(kvm, hva);
1484
1485 if (sz < 0)
1486 return sz;
1487
1488 if (sz < PMD_SIZE)
1489 return PAGE_SIZE;
1490
1491 *ipap &= PMD_MASK;
1492 pfn &= ~(PTRS_PER_PMD - 1);
1493 *pfnp = pfn;
1494
1495 return PMD_SIZE;
1496 }
1497
1498 /* Use page mapping if we cannot use block mapping. */
1499 return PAGE_SIZE;
1500 }
1501
1502 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
1503 {
1504 unsigned long pa;
1505
1506 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
1507 return huge_page_shift(hstate_vma(vma));
1508
1509 if (!(vma->vm_flags & VM_PFNMAP))
1510 return PAGE_SHIFT;
1511
1512 VM_BUG_ON(is_vm_hugetlb_page(vma));
1513
1514 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
1515
1516 #ifndef __PAGETABLE_PMD_FOLDED
1517 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
1518 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
1519 ALIGN(hva, PUD_SIZE) <= vma->vm_end)
1520 return PUD_SHIFT;
1521 #endif
1522
1523 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
1524 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
1525 ALIGN(hva, PMD_SIZE) <= vma->vm_end)
1526 return PMD_SHIFT;
1527
1528 return PAGE_SHIFT;
1529 }
1530
1531 /*
1532 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
1533 * able to see the page's tags and therefore they must be initialised first. If
1534 * PG_mte_tagged is set, tags have already been initialised.
1535 *
1536 * The race in the test/set of the PG_mte_tagged flag is handled by:
1537 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
1538 * racing to sanitise the same page
1539 * - mmap_lock protects between a VM faulting a page in and the VMM performing
1540 * an mprotect() to add VM_MTE
1541 */
1542 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
1543 unsigned long size)
1544 {
1545 unsigned long i, nr_pages = size >> PAGE_SHIFT;
1546 struct page *page = pfn_to_page(pfn);
1547
1548 if (!kvm_has_mte(kvm))
1549 return;
1550
1551 for (i = 0; i < nr_pages; i++, page++) {
1552 if (try_page_mte_tagging(page)) {
1553 mte_clear_page_tags(page_address(page));
1554 set_page_mte_tagged(page);
1555 }
1556 }
1557 }
1558
1559 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
1560 {
1561 return vma->vm_flags & VM_MTE_ALLOWED;
1562 }
1563
1564 static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
1565 {
1566 return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages,
1567 ipa, ipa + PAGE_SIZE - 1);
1568 }
1569
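/*
 * pKVM prefault control, set with the "kvm-arm.protected_prefault" early
 * parameter and capped at 9; used below as an order (a power-of-two number
 * of pages).
 */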
1570 static u16 pkvm_prefault __read_mostly;
1571
1572 static int __init early_pkvm_prefault_cfg(char *buf)
1573 {
1574 int ret = kstrtou16(buf, 10, &pkvm_prefault);
1575
1576 if (ret)
1577 return ret;
1578
1579 pkvm_prefault = min(pkvm_prefault, 9);
1580
1581 return 0;
1582 }
1583 early_param("kvm-arm.protected_prefault", early_pkvm_prefault_cfg);
1584
1585 static int insert_ppage(struct kvm *kvm, struct kvm_pinned_page *ppage)
1586 {
1587 if (find_ppage(kvm, ppage->ipa))
1588 return -EEXIST;
1589
1590 kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
1591
1592 return 0;
1593 }
1594
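/*
 * Clamp a request starting at @gfn to the end of @memslot and return its
 * length in pages, or -EINVAL if the hva is invalid (or not writable for an
 * enabled protected VM).
 */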
1595 static long __pkvm_align_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1596 gfn_t gfn, size_t size)
1597 {
1598 gfn_t memslot_end, gfn_end;
1599 unsigned long hva;
1600 bool writable;
1601
1602 size = PAGE_ALIGN(size);
1603
1604 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1605 if (kvm_is_error_hva(hva) || (kvm->arch.pkvm.enabled && !writable))
1606 return -EINVAL;
1607
1608 memslot_end = memslot->base_gfn + memslot->npages;
1609 gfn_end = min(gfn + (size >> PAGE_SHIFT), memslot_end);
1610
1611 return gfn_end - gfn;
1612 }
1613
1614 static int __pkvm_pin_user_pages(struct kvm *kvm, struct kvm_memory_slot *memslot,
1615 u64 gfn, u64 nr_pages, struct page ***__pages)
1616 {
1617 unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
1618 unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
1619 struct mm_struct *mm = current->mm;
1620 struct page **pages;
1621 long ret;
1622 int p;
1623
1624 pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
1625 if (!pages)
1626 return -ENOMEM;
1627
1628 mmap_read_lock(mm);
1629 ret = pin_user_pages(hva, nr_pages, flags, pages);
1630 mmap_read_unlock(mm);
1631
1632 if (ret == -EHWPOISON) {
1633 kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
1634 goto err_free_pages;
1635 } else if (ret == -EFAULT) {
1636 /* Will try MMIO map */
1637 ret = -EREMOTEIO;
1638 goto err_free_pages;
1639 } else if (ret < 0) {
1640 ret = -EFAULT;
1641 goto err_free_pages;
1642 } else if (ret != nr_pages) {
1643 nr_pages = ret;
1644 ret = -EFAULT;
1645 goto err_unpin_pages;
1646 }
1647
1648 /*
1649 * We really can't deal with page-cache pages returned by GUP
1650 * because (a) we may trigger writeback of a page for which we
1651 * no longer have access and (b) page_mkclean() won't find the
1652 * stage-2 mapping in the rmap so we can get out-of-whack with
1653 * the filesystem when marking the page dirty during unpinning
1654 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
1655 * without asking ext4 first")).
1656 *
1657 * Ideally we'd just restrict ourselves to anonymous pages, but
1658 * we also want to allow memfd (i.e. shmem) pages, so check for
1659 * pages backed by swap in the knowledge that the GUP pin will
1660 * prevent try_to_unmap() from succeeding.
1661 */
1662 for (p = 0; p < nr_pages; p++) {
1663 if (!folio_test_swapbacked(page_folio(pages[p]))) {
1664 ret = -EIO;
1665 goto err_unpin_pages;
1666 }
1667 }
1668
1669 *__pages = pages;
1670 return 0;
1671
1672 err_unpin_pages:
1673 unpin_user_pages(pages, nr_pages);
1674 err_free_pages:
1675 kfree(pages);
1676 return ret;
1677 }
1678
1679 /*
1680 * pKVM relies on pinning the page then getting the pfn from there to map it,
1681 * However, to avoid adding overhead on the hot path with checking pfn first,
1682 * device check is done on the fail path for pin_user_pages, inside -EFAULT
1683 * case, that is possible because the VMA for the device mapping is VM_IO,
1684 * which fails in check_vma_flags() with -EFAULT
1685 */
1686 static int __pkvm_mem_abort_device(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
1687 gfn_t gfn, u64 nr_pages)
1688 {
1689 while (nr_pages--) {
1690 kvm_pfn_t pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
1691 kvm_is_write_fault(vcpu), NULL, NULL);
1692 if (is_error_noslot_pfn(pfn))
1693 return -EFAULT;
1694
1695 if (kvm_is_device_pfn(pfn)) {
1696 int ret = kvm_call_hyp_nvhe(__pkvm_host_map_guest_mmio, pfn, gfn);
1697
1698 if (ret == -EEXIST)
1699 ret = 0; /* We might have raced with another vCPU. */
1700 } else {
1701 /* Release pin from __gfn_to_pfn_memslot(). */
1702 kvm_release_pfn_clean(pfn);
1703 return -EFAULT;
1704 }
1705
1706 gfn++;
1707 }
1708
1709 return 0;
1710 }
1711
1712 /*
1713 * Create a list of kvm_pinned_page based on the array of pages from
1714 * __pkvm_pin_pages in preparation for EL2 mapping.
1715 *
1716 * On success, no unpinning is necessary. On error, the entire original
1717 * pages array must be unpinned.
1718 */
1719 static int
1720 __pkvm_pages_to_ppages(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn,
1721 long *__nr_pages, struct page **pages, struct list_head *ppages)
1722 {
1723 struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc);
1724 long nr_ppages = 0, nr_pages = *__nr_pages;
1725 struct kvm_pinned_page *ppage, *tmp;
1726 int p, ret = 0;
1727
1728 /* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */
1729 for (p = 0; p < nr_pages; p++) {
1730 ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
1731 if (!ppage) {
1732 ret = -ENOMEM;
1733 goto err;
1734 }
1735 list_add(&ppage->list_node, &ppage_prealloc);
1736 }
1737
1738 p = 0;
1739 read_lock(&kvm->mmu_lock);
1740 while (p < nr_pages) {
1741 phys_addr_t ipa = gfn << PAGE_SHIFT;
1742 long skip, page_size = PAGE_SIZE;
1743 struct page *page = pages[p];
1744 u64 pfn;
1745
1746 ppage = kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages,
1747 ipa, ipa + PAGE_SIZE - 1);
1748 if (ppage) {
1749 unpin_user_pages(&page, 1);
1750 goto next;
1751 }
1752
1753 pfn = page_to_pfn(page);
1754
1755 if (!kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages,
1756 ALIGN_DOWN(ipa, PMD_SIZE),
1757 ALIGN(ipa + 1, PMD_SIZE) - 1)){
1758 unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
1759
1760 page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &ipa);
1761 }
1762
1763 /* Pop a ppage from the pre-allocated list */
1764 ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node);
1765 list_del_init(&ppage->list_node);
1766
1767 ppage->page = pfn_to_page(pfn);
1768 ppage->ipa = ipa;
1769 ppage->order = get_order(page_size);
1770 list_add_tail(&ppage->list_node, ppages);
1771 nr_ppages += 1 << ppage->order;
1772
1773 next:
1774 /* Number of pages to skip (covered by a THP) */
1775 skip = ppage->order ? ALIGN(gfn + 1, 1 << ppage->order) - gfn - 1 : 0;
1776 if (skip) {
1777 long nr_pins = min_t(long, skip, nr_pages - p - 1);
1778
1779 if (nr_pins >= 1)
1780 unpin_user_pages(&pages[p + 1], nr_pins);
1781 }
1782
1783 p += skip + 1;
1784 gfn += skip + 1;
1785 }
1786 read_unlock(&kvm->mmu_lock);
1787
1788 *__nr_pages = nr_ppages;
1789
1790 err:
1791 /* Free unused pre-allocated kvm_pinned_page */
1792 list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) {
1793 list_del(&ppage->list_node);
1794 kfree(ppage);
1795 }
1796
1797 return ret;
1798 }
1799
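/*
 * Top up the vCPU stage-2 memcache so EL2 has enough page-table pages to map
 * the IPA range spanned by the ppages list. The range is rounded out to PMD
 * granularity, converted into a number of PMD-sized blocks, and multiplied by
 * kvm_mmu_cache_min_pages().
 *
 * Rough worked example (assuming 4K pages, i.e. PTRS_PER_PTE == 512): a 4MiB
 * span gives (4MiB >> PAGE_SHIFT) / 512 == 2 blocks, so
 * 2 * kvm_mmu_cache_min_pages(mmu) pages are requested.
 */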
1800 static int __pkvm_topup_stage2_memcache(struct kvm_vcpu *vcpu, struct list_head *ppages)
1801 {
1802 struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
1803 struct kvm_s2_mmu *mmu = &vcpu->kvm->arch.mmu;
1804 struct kvm_pinned_page *first, *last;
1805 unsigned long nr_stage2_pages;
1806 size_t size;
1807
1808 last = list_last_entry(ppages, struct kvm_pinned_page, list_node);
1809 first = list_first_entry(ppages, struct kvm_pinned_page, list_node);
1810 size = ALIGN(last->ipa + (PAGE_SIZE << last->order), PMD_SIZE) -
1811 ALIGN_DOWN(first->ipa, PMD_SIZE);
1812
1813 /*
1814 * (size in blocks) * (pages to install a stage-2 translation)
1815 *
1816 * Does not take into account possible (but unlikely) discontinuities in
1817 * the ppages list.
1818 */
1819 nr_stage2_pages = (size >> PAGE_SHIFT) / PTRS_PER_PTE;
1820 nr_stage2_pages *= kvm_mmu_cache_min_pages(mmu);
1821
1822 return topup_hyp_memcache_account(vcpu->kvm, hyp_memcache, nr_stage2_pages, 0);
1823 }
1824
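/*
 * Donate the pinned pages to the guest in batches using a scatter-gather list
 * filled in vcpu->arch.hyp_reqs and consumed by EL2. Each batch is capped to
 * limit the time spent at EL2, and successfully donated entries are moved from
 * @ppages into the VM's pinned_pages tree. The exact layout expected by the
 * EL2 side of __pkvm_host_donate_guest_sglist is not visible in this file;
 * this only describes the EL1 side below.
 */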
1825 static int __pkvm_host_donate_guest_sglist(struct kvm_vcpu *vcpu, struct list_head *ppages)
1826 {
1827 struct kvm *kvm = vcpu->kvm;
1828 int ret;
1829
1830 lockdep_assert_held_write(&kvm->mmu_lock);
1831
1832 do {
1833 struct kvm_hyp_pinned_page *hyp_ppage = NULL;
1834 struct kvm_pinned_page *tmp, *ppage;
1835 int p, nr_ppages = 0;
1836
1837 list_for_each_entry(ppage, ppages, list_node) {
1838 u64 pfn = page_to_pfn(ppage->page);
1839 gfn_t gfn = ppage->ipa >> PAGE_SHIFT;
1840
1841 hyp_ppage = next_kvm_hyp_pinned_page(vcpu->arch.hyp_reqs, hyp_ppage, false);
1842 if (!hyp_ppage)
1843 break;
1844
1845 hyp_ppage->pfn = pfn;
1846 hyp_ppage->gfn = gfn;
1847 hyp_ppage->order = ppage->order;
1848 nr_ppages++;
1849
1850 /* Limit the time spent at EL2 */
1851 if (nr_ppages >= (1 << max(pkvm_prefault, 5)))
1852 break;
1853 }
1854
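/*
 * If there is room left in the shared area, terminate the list: setting an
 * all-ones order in the next entry presumably acts as an end-of-list marker
 * for EL2 (assumption; the EL2 consumer is not visible in this file).
 */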
1855 if (hyp_ppage) {
1856 hyp_ppage = next_kvm_hyp_pinned_page(vcpu->arch.hyp_reqs, hyp_ppage, false);
1857 if (hyp_ppage)
1858 hyp_ppage->order = ~((u8)0);
1859 }
1860
1861 ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest_sglist);
1862 /* See __pkvm_host_donate_guest() -EPERM comment */
1863 if (ret == -EPERM) {
1864 ret = 0;
1865 break;
1866 } else if (ret) {
1867 break;
1868 }
1869
1870 p = 0;
1871 list_for_each_entry_safe(ppage, tmp, ppages, list_node) {
1872 if (p++ >= nr_ppages)
1873 break;
1874
1875 list_del(&ppage->list_node);
1876 ppage->node.rb_right = ppage->node.rb_left = NULL;
1877 WARN_ON(insert_ppage(kvm, ppage));
1878 }
1879 } while (!list_empty(ppages));
1880
1881 return ret;
1882 }
1883
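/*
 * Donate the pinned pages in @ppages to the protected guest. When the
 * protected VM is enabled and the list holds more than one entry, the batched
 * sglist path is used; otherwise each ppage is donated with an individual
 * hypercall. Entries accepted by EL2 are moved into the VM's pinned_pages
 * tree; anything left on @ppages is for the caller to clean up.
 */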
1884 static int __pkvm_host_donate_guest(struct kvm_vcpu *vcpu, struct list_head *ppages)
1885 {
1886 struct kvm_pinned_page *ppage, *tmp;
1887 struct kvm *kvm = vcpu->kvm;
1888 int ret = -EINVAL; /* Empty list */
1889
1890 write_lock(&kvm->mmu_lock);
1891
1892 if (ppages->next != ppages->prev && kvm->arch.pkvm.enabled) {
1893 ret = __pkvm_host_donate_guest_sglist(vcpu, ppages);
1894 goto unlock;
1895 }
1896
1897 list_for_each_entry_safe(ppage, tmp, ppages, list_node) {
1898 u64 pfn = page_to_pfn(ppage->page);
1899 gfn_t gfn = ppage->ipa >> PAGE_SHIFT;
1900
1901 ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest, pfn, gfn,
1902 1 << ppage->order);
1903 /*
1904 * Getting -EPERM at this point implies that the pfn has already been
1905 * mapped. This should only ever happen when two vCPUs faulted on the
1906 * same page, and the current one lost the race to do the mapping...
1907 *
1908 * ...or if we've tried to map a region containing an already mapped
1909 * entry.
1910 */
1911 if (ret == -EPERM) {
1912 ret = 0;
1913 continue;
1914 } else if (ret) {
1915 break;
1916 }
1917
1918 list_del(&ppage->list_node);
1919 ppage->node.rb_right = ppage->node.rb_left = NULL;
1920 WARN_ON(insert_ppage(kvm, ppage));
1922 }
1923
1924 unlock:
1925 write_unlock(&kvm->mmu_lock);
1926
1927 return ret;
1928 }
1929
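/*
 * Handle a stage-2 fault for a protected guest: pin the host pages backing the
 * faulting range, build the kvm_pinned_page list, top up the stage-2 memcache,
 * account the pinned memory against the mm locked_vm limit and donate the
 * pages to the guest at EL2. An -EHWPOISON result from pinning is treated as
 * handled, and -EREMOTEIO (device mapping) is redirected to
 * __pkvm_mem_abort_device().
 */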
1930 static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size,
1931 struct kvm_memory_slot *memslot)
1932 {
1933 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1934 struct kvm_pinned_page *ppage, *tmp;
1935 struct mm_struct *mm = current->mm;
1936 struct kvm *kvm = vcpu->kvm;
1937 bool account_dec = false;
1938 struct page **pages;
1939 LIST_HEAD(ppages);
1940 long ret, nr_pages;
1941
1942 if (WARN_ON(!kvm->arch.pkvm.enabled))
1943 return -EINVAL;
1944
1945 nr_pages = __pkvm_align_memslot(kvm, memslot, gfn, size);
1946 if (nr_pages < 0)
1947 return nr_pages;
1948
1949 ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages);
1950 if (ret == -EHWPOISON)
1951 return 0;
1952 else if (ret == -EREMOTEIO)
1953 return __pkvm_mem_abort_device(vcpu, memslot, gfn, nr_pages);
1954 else if (ret)
1955 return ret;
1956
1957 ret = __pkvm_pages_to_ppages(kvm, memslot, gfn, &nr_pages, pages, &ppages);
1958 if (ret) {
1959 unpin_user_pages(pages, nr_pages);
1960 goto free_pages;
1961 } else if (list_empty(&ppages)) {
1962 ret = 0;
1963 goto free_pages;
1964 }
1965
1966 ret = __pkvm_topup_stage2_memcache(vcpu, &ppages);
1967 if (ret)
1968 goto free_ppages;
1969
1970 ret = account_locked_vm(mm, nr_pages, true);
1971 if (ret)
1972 goto free_ppages;
1973 account_dec = true;
1974
1975 ret = __pkvm_host_donate_guest(vcpu, &ppages);
1976
1977 free_ppages:
1978 /* Pages left in the list haven't been mapped */
1979 list_for_each_entry_safe(ppage, tmp, &ppages, list_node) {
1980 list_del(&ppage->list_node);
1981 unpin_user_pages(&ppage->page, 1);
1982 if (account_dec)
1983 account_locked_vm(mm, 1 << ppage->order, false);
1984 kfree(ppage);
1985 }
1986
1987 free_pages:
1988 kfree(pages);
1989
1990 return ret;
1991 }
1992
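/*
 * Fault in an arbitrary IPA range for a protected guest. The range must be
 * page aligned and fit within the guest IPA space, and is resolved against a
 * single memslot under the kvm->srcu read lock.
 */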
1993 int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size)
1994 {
1995 phys_addr_t ipa_end = fault_ipa + size - 1;
1996 struct kvm_memory_slot *memslot;
1997 int idx, err = 0;
1998
1999 if (!PAGE_ALIGNED(size | fault_ipa))
2000 return -EINVAL;
2001
2002 if (ipa_end >= BIT_ULL(get_kvm_ipa_limit()) ||
2003 ipa_end >= kvm_phys_size(vcpu->arch.hw_mmu) ||
2004 ipa_end <= fault_ipa)
2005 return -EINVAL;
2006
2007 idx = srcu_read_lock(&vcpu->kvm->srcu);
2008 memslot = gfn_to_memslot(vcpu->kvm, fault_ipa >> PAGE_SHIFT);
2009 err = pkvm_mem_abort(vcpu, fault_ipa, size, memslot);
2010 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2011
2012 return err;
2013 }
2014
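/*
 * Fault in a prefault-sized window around @fault_ipa to amortise the cost of
 * pinning and donating pages one fault at a time. The window is
 * (1 << pkvm_prefault) pages, aligned down to its own size and clamped to the
 * start of the memslot. For example, assuming pkvm_prefault == 9 with 4K pages
 * (an assumption; the default is defined elsewhere), the window is 512 pages,
 * i.e. 2MiB.
 */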
2015 int pkvm_mem_abort_prefault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
2016 struct kvm_memory_slot *memslot)
2017 {
2018 phys_addr_t memslot_start = memslot->base_gfn << PAGE_SHIFT;
2019 size_t size = (1 << pkvm_prefault) << PAGE_SHIFT;
2020
2021 fault_ipa = ALIGN_DOWN(fault_ipa, size);
2022 if (fault_ipa < memslot_start) {
2023 size -= memslot_start - fault_ipa;
2024 fault_ipa = memslot_start;
2025 }
2026
2027 return pkvm_mem_abort(vcpu, fault_ipa, size, memslot);
2028 }
2029
2030 /*
2031 * In the pKVM case, splitting is only expected on the back of a guest relinquish
2032 * HVC, whereas pkvm_pgtable_stage2_split() can also be called for dirty logging.
2033 */
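/*
 * Split a PMD-sized block mapping at @ipa into PAGE_SIZE mappings: pin the
 * remaining tail pages (the head already holds a pin), ask EL2 to split the
 * block via __pkvm_host_split_guest, then track each new page with its own
 * kvm_pinned_page entry.
 */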
2034 int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size)
2035 {
2036 struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc);
2037 struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
2038 struct kvm_pinned_page *ppage, *tmp;
2039 struct kvm_memory_slot *memslot;
2040 struct kvm *kvm = vcpu->kvm;
2041 int idx, p, ret, nr_pages;
2042 struct page **pages;
2043 kvm_pfn_t pfn;
2044 gfn_t gfn;
2045
2046 if (WARN_ON(!kvm->arch.pkvm.enabled))
2047 return -EINVAL;
2048
2049 if (!IS_ALIGNED(ipa, PMD_SIZE) || size != PMD_SIZE)
2050 return -EINVAL;
2051
2052 ret = topup_hyp_memcache_account(vcpu->kvm, hyp_memcache, 1, 0);
2053 if (ret)
2054 return ret;
2055
2056 /* We already have 1 pin on the Huge Page */
2057 nr_pages = (size >> PAGE_SHIFT) - 1;
2058 gfn = (ipa >> PAGE_SHIFT) + 1;
2059
2060 /* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */
2061 for (p = 0; p < nr_pages; p++) {
2062 ppage = kzalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
2063 if (!ppage) {
2064 ret = -ENOMEM;
2065 goto free_pinned_pages;
2066 }
2067 list_add(&ppage->list_node, &ppage_prealloc);
2068 }
2069
2070 idx = srcu_read_lock(&vcpu->kvm->srcu);
2071 memslot = gfn_to_memslot(vcpu->kvm, gfn);
2072 ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages);
2073 if (ret)
2074 goto unlock_srcu;
2075
2076 write_lock(&kvm->mmu_lock);
2077
2078 ppage = find_ppage(kvm, ipa);
2079 if (!ppage) {
2080 ret = -EPERM;
2081 goto end;
2082 } else if (!ppage->order) {
2083 ret = 0;
2084 goto end;
2085 }
2086
2087 ret = kvm_call_hyp_nvhe(__pkvm_host_split_guest, ipa >> PAGE_SHIFT, size);
2088 if (ret)
2089 goto end;
2090
2091 ppage->order = 0;
2092
2093 pfn = page_to_pfn(ppage->page) + 1;
2094 ipa = ipa + PAGE_SIZE;
2095 while (nr_pages--) {
2096 /* Pop a ppage from the pre-allocated list */
2097 ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node);
2098 list_del_init(&ppage->list_node);
2099
2100 ppage->page = pfn_to_page(pfn);
2101 ppage->ipa = ipa;
2102 ppage->order = 0;
2103 insert_ppage(kvm, ppage);
2104
2105 pfn += 1;
2106 ipa += PAGE_SIZE;
2107 }
2108
2109 end:
2110 write_unlock(&kvm->mmu_lock);
2111
2112 if (ret)
2113 unpin_user_pages(pages, nr_pages);
2114 kfree(pages);
2115
2116 unlock_srcu:
2117 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2118
2119 free_pinned_pages:
2120 /* Free unused pre-allocated kvm_pinned_page */
2121 list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) {
2122 list_del(&ppage->list_node);
2123 kfree(ppage);
2124 }
2125
2126 return ret;
2127 }
2128
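/*
 * Handle a stage-2 fault on memslot-backed memory for a non-protected guest:
 * pick the largest mapping size allowed by the backing VMA (and, for nested
 * guests, by the shadow stage-2), resolve the pfn, then install or relax the
 * stage-2 mapping under the MMU lock, bailing out so the guest retries if the
 * MMU notifier sequence changed underneath us.
 */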
2129 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
2130 struct kvm_s2_trans *nested,
2131 struct kvm_memory_slot *memslot,
2132 bool fault_is_perm)
2133 {
2134 int ret = 0;
2135 bool write_fault, writable, force_pte = false;
2136 bool exec_fault, mte_allowed;
2137 bool device = false, vfio_allow_any_uc = false;
2138 unsigned long mmu_seq;
2139 phys_addr_t ipa = fault_ipa;
2140 struct kvm *kvm = vcpu->kvm;
2141 struct vm_area_struct *vma;
2142 short vma_shift;
2143 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
2144 void *memcache;
2145 kvm_pfn_t pfn;
2146 bool logging_active = memslot_is_logging(memslot);
2147 long vma_pagesize, fault_granule;
2148 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
2149 struct kvm_pgtable *pgt;
2150 unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
2151 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
2152
2153 if (fault_is_perm)
2154 fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
2155 write_fault = kvm_is_write_fault(vcpu);
2156 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
2157 VM_BUG_ON(write_fault && exec_fault);
2158
2159 if (fault_is_perm && !write_fault && !exec_fault) {
2160 kvm_err("Unexpected L2 read permission error\n");
2161 return -EFAULT;
2162 }
2163
2164 /*
2165 * Permission faults just need to update the existing leaf entry,
2166 * and so normally don't require allocations from the memcache. The
2167 * only exception to this is when dirty logging is enabled at runtime
2168 * and a write fault needs to collapse a block entry into a table.
2169 */
2170 if (!fault_is_perm || (logging_active && write_fault)) {
2171 int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
2172
2173 if (!is_protected_kvm_enabled()) {
2174 memcache = &vcpu->arch.mmu_page_cache;
2175 ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
2176 } else {
2177 memcache = &vcpu->arch.stage2_mc;
2178 ret = topup_hyp_memcache_account(kvm, memcache, min_pages, 0);
2179 }
2180 if (ret)
2181 return ret;
2182 }
2183
2184 /*
2185 * Let's check if we will get back a huge page backed by hugetlbfs, or
2186 * get block mapping for device MMIO region.
2187 */
2188 mmap_read_lock(current->mm);
2189 vma = vma_lookup(current->mm, hva);
2190 if (unlikely(!vma)) {
2191 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
2192 mmap_read_unlock(current->mm);
2193 return -EFAULT;
2194 }
2195
2196 /*
2197 * logging_active is guaranteed to never be true for VM_PFNMAP
2198 * memslots.
2199 */
2200 if (logging_active) {
2201 force_pte = true;
2202 vma_shift = PAGE_SHIFT;
2203 } else {
2204 vma_shift = get_vma_page_shift(vma, hva);
2205 }
2206
2207 switch (vma_shift) {
2208 #ifndef __PAGETABLE_PMD_FOLDED
2209 case PUD_SHIFT:
2210 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
2211 break;
2212 fallthrough;
2213 #endif
2214 case CONT_PMD_SHIFT:
2215 vma_shift = PMD_SHIFT;
2216 fallthrough;
2217 case PMD_SHIFT:
2218 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
2219 break;
2220 fallthrough;
2221 case CONT_PTE_SHIFT:
2222 vma_shift = PAGE_SHIFT;
2223 force_pte = true;
2224 fallthrough;
2225 case PAGE_SHIFT:
2226 break;
2227 default:
2228 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
2229 }
2230
2231 vma_pagesize = 1UL << vma_shift;
2232
2233 if (nested) {
2234 unsigned long max_map_size;
2235
2236 max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
2237
2238 ipa = kvm_s2_trans_output(nested);
2239
2240 /*
2241 * If we're about to create a shadow stage 2 entry, then we
2242 * can only create a block mapping if the guest stage 2 page
2243 * table uses at least as big a mapping.
2244 */
2245 max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
2246
2247 /*
2248 * Be careful that if the mapping size falls between
2249 * two host sizes, take the smallest of the two.
2250 */
2251 if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
2252 max_map_size = PMD_SIZE;
2253 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
2254 max_map_size = PAGE_SIZE;
2255
2256 force_pte = (max_map_size == PAGE_SIZE);
2257 vma_pagesize = min(vma_pagesize, (long)max_map_size);
2258 }
2259
2260 /*
2261 * Both the canonical IPA and fault IPA must be hugepage-aligned to
2262 * ensure we find the right PFN and lay down the mapping in the right
2263 * place.
2264 */
2265 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
2266 fault_ipa &= ~(vma_pagesize - 1);
2267 ipa &= ~(vma_pagesize - 1);
2268 }
2269
2270 gfn = fault_ipa >> PAGE_SHIFT;
2271 mte_allowed = kvm_vma_mte_allowed(vma);
2272
2273 vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
2274
2275 /* Don't use the VMA after the unlock -- it may have vanished */
2276 vma = NULL;
2277
2278 /*
2279 * Read mmu_invalidate_seq so that KVM can detect if the results of
2280 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
2281 * acquiring kvm->mmu_lock.
2282 *
2283 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
2284 * with the smp_wmb() in kvm_mmu_invalidate_end().
2285 */
2286 mmu_seq = vcpu->kvm->mmu_invalidate_seq;
2287 mmap_read_unlock(current->mm);
2288
2289 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
2290 write_fault, &writable, NULL);
2291 if (pfn == KVM_PFN_ERR_HWPOISON) {
2292 kvm_send_hwpoison_signal(hva, vma_shift);
2293 return 0;
2294 }
2295 if (is_error_noslot_pfn(pfn))
2296 return -EFAULT;
2297
2298 if (kvm_is_device_pfn(pfn)) {
2299 /*
2300 * If the page was identified as device early by looking at
2301 * the VMA flags, vma_pagesize is already representing the
2302 * largest quantity we can map. If instead it was mapped
2303 * via __gfn_to_pfn_memslot(), vma_pagesize is set to PAGE_SIZE
2304 * and must not be upgraded.
2305 *
2306 * In both cases, we don't let transparent_hugepage_adjust()
2307 * change things at the last minute.
2308 */
2309 device = true;
2310 } else if (logging_active && !write_fault) {
2311 /*
2312 * Only actually map the page as writable if this was a write
2313 * fault.
2314 */
2315 writable = false;
2316 }
2317
2318 if (exec_fault && device)
2319 return -ENOEXEC;
2320
2321 /*
2322 * Potentially reduce shadow S2 permissions to match the guest's own
2323 * S2. For exec faults, we'd only reach this point if the guest
2324 * actually allowed it (see kvm_s2_handle_perm_fault).
2325 *
2326 * Also encode the level of the original translation in the SW bits
2327 * of the leaf entry as a proxy for the span of that translation.
2328 * This will be retrieved on TLB invalidation from the guest and
2329 * used to limit the invalidation scope if a TTL hint or a range
2330 * isn't provided.
2331 */
2332 if (nested) {
2333 writable &= kvm_s2_trans_writable(nested);
2334 if (!kvm_s2_trans_readable(nested))
2335 prot &= ~KVM_PGTABLE_PROT_R;
2336
2337 prot |= kvm_encode_nested_level(nested);
2338 }
2339
2340 kvm_fault_lock(kvm);
2341 pgt = vcpu->arch.hw_mmu->pgt;
2342 if (mmu_invalidate_retry(kvm, mmu_seq)) {
2343 ret = -EAGAIN;
2344 goto out_unlock;
2345 }
2346
2347 /*
2348 * If we are not forced to use page mapping, check if we are
2349 * backed by a THP and thus use block mapping if possible.
2350 */
2351 if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
2352 if (fault_is_perm && fault_granule > PAGE_SIZE)
2353 vma_pagesize = fault_granule;
2354 else
2355 vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
2356 hva, &pfn,
2357 &fault_ipa);
2358
2359 if (vma_pagesize < 0) {
2360 ret = vma_pagesize;
2361 goto out_unlock;
2362 }
2363 }
2364
2365 if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
2366 /* Check the VMM hasn't introduced a new disallowed VMA */
2367 if (mte_allowed) {
2368 sanitise_mte_tags(kvm, pfn, vma_pagesize);
2369 } else {
2370 ret = -EFAULT;
2371 goto out_unlock;
2372 }
2373 }
2374
2375 if (writable)
2376 prot |= KVM_PGTABLE_PROT_W;
2377
2378 if (exec_fault)
2379 prot |= KVM_PGTABLE_PROT_X;
2380
2381 if (device) {
2382 if (vfio_allow_any_uc)
2383 prot |= KVM_PGTABLE_PROT_NORMAL_NC;
2384 else
2385 prot |= KVM_PGTABLE_PROT_DEVICE;
2386 } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
2387 (!nested || kvm_s2_trans_executable(nested))) {
2388 prot |= KVM_PGTABLE_PROT_X;
2389 }
2390
2391 /*
2392 * Under the premise of getting a FSC_PERM fault, we just need to relax
2393 * permissions only if vma_pagesize equals fault_granule. Otherwise,
2394 * kvm_pgtable_stage2_map() should be called to change block size.
2395 */
2396 if (fault_is_perm && vma_pagesize == fault_granule) {
2397 /*
2398 * Drop the SW bits in favour of those stored in the
2399 * PTE, which will be preserved.
2400 */
2401 prot &= ~KVM_NV_GUEST_MAP_SZ;
2402 ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
2403 } else {
2404 ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
2405 __pfn_to_phys(pfn), prot,
2406 memcache, flags);
2407 }
2408
2409 out_unlock:
2410 kvm_fault_unlock(kvm);
2411
2412 /* Mark the page dirty only if the fault is handled successfully */
2413 if (writable && !ret) {
2414 kvm_set_pfn_dirty(pfn);
2415 mark_page_dirty_in_slot(kvm, memslot, gfn);
2416 }
2417
2418 kvm_release_pfn_clean(pfn);
2419 return ret != -EAGAIN ? ret : 0;
2420 }
2421
2422 /* Resolve the access fault by making the page young again. */
2423 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
2424 {
2425 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
2426 kvm_pte_t pte;
2427 struct kvm_s2_mmu *mmu;
2428
2429 trace_kvm_access_fault(fault_ipa);
2430
2431 read_lock(&vcpu->kvm->mmu_lock);
2432 mmu = vcpu->arch.hw_mmu;
2433 pte = KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
2434 read_unlock(&vcpu->kvm->mmu_lock);
2435
2436 if (kvm_pte_valid(pte))
2437 kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
2438 }
2439
2440 /**
2441 * kvm_handle_guest_abort - handles all 2nd stage aborts
2442 * @vcpu: the VCPU pointer
2443 *
2444 * Any abort that gets to the host is almost guaranteed to be caused by a
2445 * missing second stage translation table entry, which can mean that either the
2446 * guest simply needs more memory and we must allocate an appropriate page, or
2447 * that the guest tried to access I/O memory, which is emulated by user
2448 * space. The distinction is based on the IPA causing the fault and whether this
2449 * memory region has been registered as standard RAM by user space.
2450 */
2451 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
2452 {
2453 struct kvm_s2_trans nested_trans, *nested = NULL;
2454 unsigned long esr;
2455 phys_addr_t fault_ipa; /* The address we faulted on */
2456 phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
2457 struct kvm_memory_slot *memslot;
2458 bool is_iabt, write_fault, writable;
2459 gfn_t gfn;
2460 int ret, idx;
2461
2462 esr = kvm_vcpu_get_esr(vcpu);
2463
2464 ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
2465 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
2466
2467 if (esr_fsc_is_translation_fault(esr)) {
2468 /* Beyond sanitised PARange (which is the IPA limit) */
2469 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
2470 kvm_inject_size_fault(vcpu);
2471 return 1;
2472 }
2473
2474 /* Falls between the IPA range and the PARange? */
2475 if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
2476 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
2477
2478 if (is_iabt)
2479 kvm_inject_pabt(vcpu, fault_ipa);
2480 else
2481 kvm_inject_dabt(vcpu, fault_ipa);
2482 return 1;
2483 }
2484 }
2485
2486 /* Synchronous External Abort? */
2487 if (kvm_vcpu_abt_issea(vcpu)) {
2488 /*
2489 * For RAS the host kernel may handle this abort.
2490 * There is no need to pass the error into the guest.
2491 */
2492 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
2493 kvm_inject_vabt(vcpu);
2494
2495 return 1;
2496 }
2497
2498 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
2499 kvm_vcpu_get_hfar(vcpu), fault_ipa);
2500
2501 /* Check that the stage-2 fault is a translation, permission or access flag fault */
2502 if (!esr_fsc_is_translation_fault(esr) &&
2503 !esr_fsc_is_permission_fault(esr) &&
2504 !esr_fsc_is_access_flag_fault(esr)) {
2505 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
2506 kvm_vcpu_trap_get_class(vcpu),
2507 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
2508 (unsigned long)kvm_vcpu_get_esr(vcpu));
2509 return -EFAULT;
2510 }
2511
2512 idx = srcu_read_lock(&vcpu->kvm->srcu);
2513
2514 /*
2515 * We may have faulted on a shadow stage 2 page table if we are
2516 * running a nested guest. In this case, we have to resolve the L2
2517 * IPA to the L1 IPA first, before knowing what kind of memory should
2518 * back the L1 IPA.
2519 *
2520 * If the shadow stage 2 page table walk faults, then we simply inject
2521 * this to the guest and carry on.
2522 *
2523 * If there are no shadow S2 PTs because S2 is disabled, there is
2524 * nothing to walk and we treat it as a 1:1 before going through the
2525 * canonical translation.
2526 */
2527 if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
2528 vcpu->arch.hw_mmu->nested_stage2_enabled) {
2529 u32 esr;
2530
2531 ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
2532 if (ret) {
2533 esr = kvm_s2_trans_esr(&nested_trans);
2534 kvm_inject_s2_fault(vcpu, esr);
2535 goto out_unlock;
2536 }
2537
2538 ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
2539 if (ret) {
2540 esr = kvm_s2_trans_esr(&nested_trans);
2541 kvm_inject_s2_fault(vcpu, esr);
2542 goto out_unlock;
2543 }
2544
2545 ipa = kvm_s2_trans_output(&nested_trans);
2546 nested = &nested_trans;
2547 }
2548
2549 gfn = ipa >> PAGE_SHIFT;
2550 memslot = gfn_to_memslot_prot(vcpu->kvm, gfn, &writable);
2551 write_fault = kvm_is_write_fault(vcpu);
2552 if (!memslot || (write_fault && !writable)) {
2553 /*
2554 * The guest has put either its instructions or its page-tables
2555 * somewhere it shouldn't have. Userspace won't be able to do
2556 * anything about this (there's no syndrome for a start), so
2557 * re-inject the abort back into the guest.
2558 */
2559 if (is_iabt) {
2560 ret = -ENOEXEC;
2561 goto out;
2562 }
2563
2564 if (kvm_vcpu_abt_iss1tw(vcpu)) {
2565 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2566 ret = 1;
2567 goto out_unlock;
2568 }
2569
2570 /*
2571 * Check for a cache maintenance operation. Since we
2572 * ended-up here, we know it is outside of any memory
2573 * slot. But we can't find out if that is for a device,
2574 * or if the guest is just being stupid. The only thing
2575 * we know for sure is that this range cannot be cached.
2576 *
2577 * So let's assume that the guest is just being
2578 * cautious, and skip the instruction.
2579 */
2580 if (!memslot && kvm_vcpu_dabt_is_cm(vcpu)) {
2581 kvm_incr_pc(vcpu);
2582 ret = 1;
2583 goto out_unlock;
2584 }
2585
2586 /*
2587 * The IPA is reported as [MAX:12], so we need to
2588 * complement it with the bottom 12 bits from the
2589 * faulting VA. This is always 12 bits, irrespective
2590 * of the page size.
2591 */
2592 ipa |= kvm_vcpu_get_hfar(vcpu) & FAR_MASK;
2593 ret = io_mem_abort(vcpu, ipa);
2594 goto out_unlock;
2595 }
2596
2597 /* Userspace should not be able to register out-of-bounds IPAs */
2598 VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
2599
2600 if (esr_fsc_is_access_flag_fault(esr)) {
2601 handle_access_fault(vcpu, fault_ipa);
2602 ret = 1;
2603 goto out_unlock;
2604 }
2605
2606 if (kvm_vm_is_protected(vcpu->kvm))
2607 ret = pkvm_mem_abort_prefault(vcpu, fault_ipa, memslot);
2608 else
2609 ret = user_mem_abort(vcpu, fault_ipa, nested, memslot,
2610 esr_fsc_is_permission_fault(esr));
2611
2612 if (ret == 0)
2613 ret = 1;
2614 out:
2615 if (ret == -ENOEXEC) {
2616 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2617 ret = 1;
2618 }
2619 out_unlock:
2620 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2621 return ret;
2622 }
2623
2624 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
2625 {
2626 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
2627 (range->end - range->start) << PAGE_SHIFT,
2628 range->may_block);
2629
2630 kvm_nested_s2_unmap(kvm, range->may_block);
2631 return false;
2632 }
2633
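/*
 * Aging helpers for the MMU notifiers: both walk the stage-2 page table over
 * the gfn range and test the access flag, with kvm_age_gfn() additionally
 * clearing it (mkold == true).
 */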
2634 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
2635 {
2636 u64 size = (range->end - range->start) << PAGE_SHIFT;
2637
2638 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
2639 range->start << PAGE_SHIFT,
2640 size, true);
2641 /*
2642 * TODO: Handle nested_mmu structures here using the reverse mapping in
2643 * a later version of patch series.
2644 */
2645 }
2646
2647 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
2648 {
2649 u64 size = (range->end - range->start) << PAGE_SHIFT;
2650
2651 return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
2652 range->start << PAGE_SHIFT,
2653 size, false);
2654 }
2655
2656 phys_addr_t kvm_mmu_get_httbr(void)
2657 {
2658 return __pa(hyp_pgtable->pgd);
2659 }
2660
2661 phys_addr_t kvm_get_idmap_vector(void)
2662 {
2663 return hyp_idmap_vector;
2664 }
2665
2666 static int kvm_map_idmap_text(void)
2667 {
2668 unsigned long size = hyp_idmap_end - hyp_idmap_start;
2669 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
2670 PAGE_HYP_EXEC);
2671 if (err)
2672 kvm_err("Failed to idmap %lx-%lx\n",
2673 hyp_idmap_start, hyp_idmap_end);
2674
2675 return err;
2676 }
2677
2678 static void *kvm_hyp_zalloc_page(void *arg)
2679 {
2680 return (void *)get_zeroed_page(GFP_KERNEL);
2681 }
2682
2683 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
2684 .zalloc_page = kvm_hyp_zalloc_page,
2685 .get_page = kvm_host_get_page,
2686 .put_page = kvm_host_put_page,
2687 .phys_to_virt = kvm_host_va,
2688 .virt_to_phys = kvm_host_pa,
2689 };
2690
2691 int __init kvm_mmu_init(void)
2692 {
2693 int err;
2694 u32 idmap_bits;
2695 u32 kernel_bits;
2696
2697 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2698 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2699 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2700 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2701 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2702
2703 /*
2704 * We rely on the linker script to ensure at build time that the HYP
2705 * init code does not cross a page boundary.
2706 */
2707 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
2708
2709 /*
2710 * The ID map is always configured for 48 bits of translation, which
2711 * may be fewer than the number of VA bits used by the regular kernel
2712 * stage 1, when VA_BITS=52.
2713 *
2714 * At EL2, there is only one TTBR register, and we can't switch between
2715 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
2716 * line: we need to use the extended range with *both* our translation
2717 * tables.
2718 *
2719 * So use the maximum of the idmap VA bits and the regular kernel stage
2720 * 1 VA bits to assure that the hypervisor can both ID map its code page
2721 * and map any kernel memory.
2722 */
2723 idmap_bits = IDMAP_VA_BITS;
2724 kernel_bits = vabits_actual;
2725 hyp_va_bits = max(idmap_bits, kernel_bits);
2726
2727 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
2728 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2729 kvm_debug("HYP VA range: %lx:%lx\n",
2730 kern_hyp_va(PAGE_OFFSET),
2731 kern_hyp_va((unsigned long)high_memory - 1));
2732
2733 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2734 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
2735 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2736 /*
2737 * The idmap page is intersecting with the VA space,
2738 * it is not safe to continue further.
2739 */
2740 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2741 err = -EINVAL;
2742 goto out;
2743 }
2744
2745 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2746 if (!hyp_pgtable) {
2747 kvm_err("Hyp mode page-table not allocated\n");
2748 err = -ENOMEM;
2749 goto out;
2750 }
2751
2752 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops);
2753 if (err)
2754 goto out_free_pgtable;
2755
2756 err = kvm_map_idmap_text();
2757 if (err)
2758 goto out_destroy_pgtable;
2759
2760 io_map_base = hyp_idmap_start;
2761 return 0;
2762
2763 out_destroy_pgtable:
2764 kvm_pgtable_hyp_destroy(hyp_pgtable);
2765 out_free_pgtable:
2766 kfree(hyp_pgtable);
2767 hyp_pgtable = NULL;
2768 out:
2769 return err;
2770 }
2771
2772 void kvm_arch_commit_memory_region(struct kvm *kvm,
2773 struct kvm_memory_slot *old,
2774 const struct kvm_memory_slot *new,
2775 enum kvm_mr_change change)
2776 {
2777 bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
2778
2779 /*
2780 * At this point memslot has been committed and there is an
2781 * allocated dirty_bitmap[], dirty pages will be tracked while the
2782 * memory slot is write protected.
2783 */
2784 if (log_dirty_pages) {
2785
2786 if (change == KVM_MR_DELETE)
2787 return;
2788
2789 /*
2790 * Huge and normal pages are write-protected and split
2791 * in either of these two cases:
2792 *
2793 * 1. with initial-all-set: gradually with CLEAR ioctls,
2794 */
2795 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
2796 return;
2797 /*
2798 * or
2799 * 2. without initial-all-set: all in one shot when
2800 * enabling dirty logging.
2801 */
2802 kvm_mmu_wp_memory_region(kvm, new->id);
2803 kvm_mmu_split_memory_region(kvm, new->id);
2804 } else {
2805 /*
2806 * Free any leftovers from the eager page splitting cache. Do
2807 * this when deleting, moving, disabling dirty logging, or
2808 * creating the memslot (a nop). Doing it for deletes makes
2809 * sure we don't leak memory, and there's no need to keep the
2810 * cache around for any of the other cases.
2811 */
2812 kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
2813 }
2814 }
2815
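/*
 * Validate a memslot change before it is committed: for protected VMs that
 * have been started, deleting or moving a memslot is rejected, as are dirty
 * logging and read-only slots. The region must fit in the guest IPA space,
 * and every VMA it covers is checked for MTE compatibility and for dirty
 * logging on VM_PFNMAP (I/O) mappings, which is not allowed.
 */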
2816 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2817 const struct kvm_memory_slot *old,
2818 struct kvm_memory_slot *new,
2819 enum kvm_mr_change change)
2820 {
2821 hva_t hva, reg_end;
2822 int ret = 0;
2823
2824 if (is_protected_kvm_enabled()) {
2825 if ((change == KVM_MR_DELETE || change == KVM_MR_MOVE) &&
2826 pkvm_is_hyp_created(kvm) && kvm->arch.pkvm.enabled) {
2827 return -EPERM;
2828 }
2829
2830 if (new && kvm->arch.pkvm.enabled &&
2831 new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) {
2832 return -EPERM;
2833 }
2834 }
2835
2836 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2837 change != KVM_MR_FLAGS_ONLY)
2838 return 0;
2839
2840 /*
2841 * Prevent userspace from creating a memory region outside of the IPA
2842 * space addressable by the KVM guest IPA space.
2843 */
2844 if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
2845 return -EFAULT;
2846
2847 hva = new->userspace_addr;
2848 reg_end = hva + (new->npages << PAGE_SHIFT);
2849
2850 mmap_read_lock(current->mm);
2851 /*
2852 * A memory region could potentially cover multiple VMAs, and any holes
2853 * between them, so iterate over all of them.
2854 *
2855 * +--------------------------------------------+
2856 * +---------------+----------------+ +----------------+
2857 * | : VMA 1 | VMA 2 | | VMA 3 : |
2858 * +---------------+----------------+ +----------------+
2859 * | memory region |
2860 * +--------------------------------------------+
2861 */
2862 do {
2863 struct vm_area_struct *vma;
2864
2865 vma = find_vma_intersection(current->mm, hva, reg_end);
2866 if (!vma)
2867 break;
2868
2869 if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
2870 ret = -EINVAL;
2871 break;
2872 }
2873
2874 if (vma->vm_flags & VM_PFNMAP) {
2875 /* IO region dirty page logging not allowed */
2876 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2877 ret = -EINVAL;
2878 break;
2879 }
2880 }
2881 hva = min(reg_end, vma->vm_end);
2882 } while (hva < reg_end);
2883
2884 mmap_read_unlock(current->mm);
2885 return ret;
2886 }
2887
2888 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2889 {
2890 }
2891
2892 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2893 {
2894 }
2895
2896 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2897 struct kvm_memory_slot *slot)
2898 {
2899 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2900 phys_addr_t size = slot->npages << PAGE_SHIFT;
2901
2902 write_lock(&kvm->mmu_lock);
2903 kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
2904 kvm_nested_s2_unmap(kvm, true);
2905 write_unlock(&kvm->mmu_lock);
2906 }
2907
2908 /*
2909 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2910 *
2911 * Main problems:
2912 * - S/W ops are local to a CPU (not broadcast)
2913 * - We have line migration behind our back (speculation)
2914 * - System caches don't support S/W at all (damn!)
2915 *
2916 * In the face of the above, the best we can do is to try and convert
2917 * S/W ops to VA ops. Because the guest is not allowed to infer the
2918 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2919 * which is a rather good thing for us.
2920 *
2921 * Also, it is only used when turning caches on/off ("The expected
2922 * usage of the cache maintenance instructions that operate by set/way
2923 * is associated with the cache maintenance instructions associated
2924 * with the powerdown and powerup of caches, if this is required by
2925 * the implementation.").
2926 *
2927 * We use the following policy:
2928 *
2929 * - If we trap a S/W operation, we enable VM trapping to detect
2930 * caches being turned on/off, and do a full clean.
2931 *
2932 * - We flush the caches on both caches being turned on and off.
2933 *
2934 * - Once the caches are enabled, we stop trapping VM ops.
2935 */
2936 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2937 {
2938 unsigned long hcr = *vcpu_hcr(vcpu);
2939
2940 /*
2941 * If this is the first time we do a S/W operation
2942 * (i.e. HCR_TVM not set) flush the whole memory, and set the
2943 * VM trapping.
2944 *
2945 * Otherwise, rely on the VM trapping to wait for the MMU +
2946 * Caches to be turned off. At that point, we'll be able to
2947 * clean the caches again.
2948 */
2949 if (!(hcr & HCR_TVM)) {
2950 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2951 vcpu_has_cache_enabled(vcpu));
2952 stage2_flush_vm(vcpu->kvm);
2953 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2954 }
2955 }
2956
2957 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2958 {
2959 bool now_enabled = vcpu_has_cache_enabled(vcpu);
2960
2961 /*
2962 * If switching the MMU+caches on, need to invalidate the caches.
2963 * If switching it off, need to clean the caches.
2964 * Clean + invalidate does the trick always.
2965 */
2966 if (now_enabled != was_enabled)
2967 stage2_flush_vm(vcpu->kvm);
2968
2969 /* Caches are now on, stop trapping VM ops (until a S/W op) */
2970 if (now_enabled)
2971 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2972
2973 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2974 }
2975