1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6 
7 #include <linux/cma.h>
8 #include <linux/dma-map-ops.h>
9 #include <linux/mman.h>
10 #include <linux/kvm_host.h>
11 #include <linux/io.h>
12 #include <linux/hugetlb.h>
13 #include <linux/interval_tree_generic.h>
14 #include <linux/sched/signal.h>
15 #include <trace/events/kvm.h>
16 #include <asm/pgalloc.h>
17 #include <asm/cacheflush.h>
18 #include <asm/kvm_arm.h>
19 #include <asm/kvm_mmu.h>
20 #include <asm/kvm_pgtable.h>
21 #include <asm/kvm_pkvm.h>
22 #include <asm/kvm_ras.h>
23 #include <asm/kvm_asm.h>
24 #include <asm/kvm_emulate.h>
25 #include <asm/kvm_pkvm.h>
26 #include <asm/virt.h>
27 
28 #include "trace.h"
29 
30 static struct kvm_pgtable *hyp_pgtable;
31 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
32 
33 static unsigned long __ro_after_init hyp_idmap_start;
34 static unsigned long __ro_after_init hyp_idmap_end;
35 static phys_addr_t __ro_after_init hyp_idmap_vector;
36 
37 u32 __ro_after_init hyp_va_bits;
38 
39 static unsigned long __ro_after_init io_map_base;
40 
41 static bool stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
42 static bool stage2_pte_is_counted(kvm_pte_t pte, u32 level);
43 
44 static struct kvm_pgtable_pte_ops kvm_s2_pte_ops = {
45 	.force_pte_cb = stage2_force_pte_cb,
46 	.pte_is_counted_cb = stage2_pte_is_counted
47 
48 };
49 
50 #define KVM_PGT_FN(fn)		(!is_protected_kvm_enabled() ? fn : p ## fn)
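/*
 * Illustrative note (added for clarity, not part of the original source):
 * the token pasting above prefixes the callee with 'p' when protected KVM
 * is enabled, so e.g.
 *
 *	KVM_PGT_FN(kvm_pgtable_stage2_unmap)(pgt, addr, size);
 *
 * picks between kvm_pgtable_stage2_unmap() and pkvm_pgtable_stage2_unmap()
 * at runtime.
 */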
51 
52 static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
53 					   phys_addr_t size)
54 {
55 	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
56 
57 	return (boundary - 1 < end - 1) ? boundary : end;
58 }
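/*
 * Rough worked example for __stage2_range_addr_end() above (illustrative,
 * not part of the original source): with size = 2MiB, addr = 0x40001000 and
 * end = 0x40800000, boundary = ALIGN_DOWN(0x40201000, 0x200000) = 0x40200000,
 * which is below end and is therefore returned, i.e. the walk advances to the
 * next 2MiB boundary. The "- 1" in the comparison keeps the result correct if
 * the boundary wraps past the top of the address space.
 */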
59 
60 static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
61 {
62 	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
63 
64 	return __stage2_range_addr_end(addr, end, size);
65 }
66 
67 /*
68  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
69  * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
70  * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
71  * long will also starve other vCPUs.
72  */
73 static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
74 			      phys_addr_t end,
75 			      int (*fn)(struct kvm_s2_mmu *, u64, u64),
76 			      bool resched)
77 {
78 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
79 	int ret;
80 	u64 next;
81 
82 	do {
83 		next = stage2_range_addr_end(addr, end);
84 		ret = fn(mmu, addr, next - addr);
85 		if (ret)
86 			break;
87 
88 		if (resched && next != end)
89 			cond_resched_rwlock_write(&kvm->mmu_lock);
90 	} while (addr = next, addr != end);
91 
92 	return ret;
93 }
94 
95 #define stage2_apply_range_resched(mmu, addr, end, fn)			\
96 	stage2_apply_range(mmu, addr, end, fn, true)
97 
98 /*
99  * Get the maximum number of page-table pages needed to split a range
100  * of blocks into PAGE_SIZE PTEs. It assumes the range is already
101  * mapped at level 2, or at level 1 if allowed.
102  */
103 static int kvm_mmu_split_nr_page_tables(u64 range)
104 {
105 	int n = 0;
106 
107 	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
108 		n += DIV_ROUND_UP(range, PUD_SIZE);
109 	n += DIV_ROUND_UP(range, PMD_SIZE);
110 	return n;
111 }
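/*
 * Rough worked example for kvm_mmu_split_nr_page_tables() above (assuming
 * 4K pages, so PMD_SIZE = 2MiB and PUD_SIZE = 1GiB): splitting a 1GiB chunk
 * needs at most 1 table to break a level-1 block into level-2 entries (only
 * when KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) plus 512 tables to break the
 * resulting 2MiB blocks into PTEs, i.e. n = 513 pages.
 */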
112 
113 static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
114 {
115 	struct kvm_mmu_memory_cache *cache;
116 	u64 chunk_size, min;
117 
118 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
119 		return true;
120 
121 	chunk_size = kvm->arch.mmu.split_page_chunk_size;
122 	min = kvm_mmu_split_nr_page_tables(chunk_size);
123 	cache = &kvm->arch.mmu.split_page_cache;
124 	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
125 }
126 
127 static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
128 				    phys_addr_t end)
129 {
130 	struct kvm_mmu_memory_cache *cache;
131 	struct kvm_pgtable *pgt;
132 	int ret, cache_capacity;
133 	u64 next, chunk_size;
134 
135 	lockdep_assert_held_write(&kvm->mmu_lock);
136 
137 	chunk_size = kvm->arch.mmu.split_page_chunk_size;
138 	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
139 
140 	if (chunk_size == 0)
141 		return 0;
142 
143 	cache = &kvm->arch.mmu.split_page_cache;
144 
145 	do {
146 		if (need_split_memcache_topup_or_resched(kvm)) {
147 			write_unlock(&kvm->mmu_lock);
148 			cond_resched();
149 			/* Eager page splitting is best-effort. */
150 			ret = __kvm_mmu_topup_memory_cache(cache,
151 							   cache_capacity,
152 							   cache_capacity);
153 			write_lock(&kvm->mmu_lock);
154 			if (ret)
155 				break;
156 		}
157 
158 		pgt = kvm->arch.mmu.pgt;
159 		if (!pgt)
160 			return -EINVAL;
161 
162 		next = __stage2_range_addr_end(addr, end, chunk_size);
163 		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
164 		if (ret)
165 			break;
166 	} while (addr = next, addr != end);
167 
168 	return ret;
169 }
170 
171 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
172 {
173 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
174 }
175 
176 /**
177  * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
178  * @kvm:	pointer to kvm structure.
179  *
180  * Interface to HYP function to flush all VM TLB entries
181  */
182 int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
183 {
184 	if (is_protected_kvm_enabled())
185 		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
186 	else
187 		kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
188 	return 0;
189 }
190 
191 int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
192 				      gfn_t gfn, u64 nr_pages)
193 {
194 	u64 size = nr_pages << PAGE_SHIFT;
195 	u64 addr = gfn << PAGE_SHIFT;
196 
197 	if (is_protected_kvm_enabled())
198 		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
199 	else
200 		kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
201 	return 0;
202 }
203 
204 static bool kvm_is_device_pfn(unsigned long pfn)
205 {
206 	return !pfn_is_map_memory(pfn);
207 }
208 
209 static void *stage2_memcache_zalloc_page(void *arg)
210 {
211 	struct kvm_mmu_memory_cache *mc = arg;
212 	void *virt;
213 
214 	/* Allocated with __GFP_ZERO, so no need to zero */
215 	virt = kvm_mmu_memory_cache_alloc(mc);
216 	if (virt)
217 		kvm_account_pgtable_pages(virt, 1);
218 	return virt;
219 }
220 
221 static void *kvm_host_zalloc_pages_exact(size_t size)
222 {
223 	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
224 }
225 
226 static void *kvm_s2_zalloc_pages_exact(size_t size)
227 {
228 	void *virt = kvm_host_zalloc_pages_exact(size);
229 
230 	if (virt)
231 		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
232 	return virt;
233 }
234 
235 static void kvm_s2_free_pages_exact(void *virt, size_t size)
236 {
237 	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
238 	free_pages_exact(virt, size);
239 }
240 
241 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
242 
243 static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
244 {
245 	struct page *page = container_of(head, struct page, rcu_head);
246 	void *pgtable = page_to_virt(page);
247 	s8 level = page_private(page);
248 
249 	KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, &kvm_s2_pte_ops,
250 						     pgtable, level);
251 }
252 
253 static void stage2_free_unlinked_table(void *addr, s8 level)
254 {
255 	struct page *page = virt_to_page(addr);
256 
257 	set_page_private(page, (unsigned long)level);
258 	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
259 }
260 
261 static void kvm_host_get_page(void *addr)
262 {
263 	get_page(virt_to_page(addr));
264 }
265 
266 static void kvm_host_put_page(void *addr)
267 {
268 	put_page(virt_to_page(addr));
269 }
270 
271 static void kvm_s2_put_page(void *addr)
272 {
273 	struct page *p = virt_to_page(addr);
274 	/* Dropping last refcount, the page will be freed */
275 	if (page_count(p) == 1)
276 		kvm_account_pgtable_pages(addr, -1);
277 	put_page(p);
278 }
279 
280 static int kvm_host_page_count(void *addr)
281 {
282 	return page_count(virt_to_page(addr));
283 }
284 
285 static void clean_dcache_guest_page(void *va, size_t size)
286 {
287 	__clean_dcache_guest_page(va, size);
288 }
289 
290 static void invalidate_icache_guest_page(void *va, size_t size)
291 {
292 	__invalidate_icache_guest_page(va, size);
293 }
294 
295 static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
296 {
297 	return ppage->ipa;
298 }
299 
300 static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
301 {
302 	return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
303 }
304 
305 INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
306 		     __pinned_page_start, __pinned_page_end, /* empty */,
307 		     kvm_pinned_pages);
308 
309 #define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp)				\
310 	for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
311 	     __ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; });	\
312 	     __ppage = __tmp)
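/*
 * Illustrative usage sketch (hypothetical caller, not part of the original
 * source): visit every pinned page whose IPA range overlaps [ipa, ipa + size):
 *
 *	struct kvm_pinned_page *ppage, *tmp;
 *
 *	for_ppage_node_in_range(kvm, ipa, ipa + size, ppage, tmp)
 *		handle_ppage(ppage);
 *
 * The next node is looked up into @tmp before the body runs, so the body may
 * safely erase @ppage from the interval tree.
 */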
313 
314 /*
315  * Unmapping vs dcache management:
316  *
317  * If a guest maps certain memory pages as uncached, all writes will
318  * bypass the data cache and go directly to RAM.  However, the CPUs
319  * can still speculate reads (not writes) and fill cache lines with
320  * data.
321  *
322  * Those cache lines will be *clean* cache lines though, so a
323  * clean+invalidate operation is equivalent to an invalidate
324  * operation, because no cache lines are marked dirty.
325  *
326  * Those clean cache lines could be filled prior to an uncached write
327  * by the guest, and the cache coherent IO subsystem would therefore
328  * end up writing old data to disk.
329  *
330  * This is why right after unmapping a page/section and invalidating
331  * the corresponding TLBs, we flush to make sure the IO subsystem will
332  * never hit in the cache.
333  *
334  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
335  * we then fully enforce cacheability of RAM, no matter what the guest
336  * does.
337  */
338 /**
339  * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
340  * @mmu:   The KVM stage-2 MMU pointer
341  * @start: The intermediate physical base address of the range to unmap
342  * @size:  The size of the area to unmap
343  * @may_block: Whether or not we are permitted to block
344  *
345  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
346  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
347  * destroying the VM), otherwise another faulting VCPU may come in and mess
348  * with things behind our backs.
349  */
350 
351 static int ___unmap_stage2_range(struct kvm_s2_mmu *mmu, u64 addr, u64 size)
352 {
353 	return KVM_PGT_FN(kvm_pgtable_stage2_unmap)(mmu->pgt, addr, size);
354 }
355 
356 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
357 				 bool may_block)
358 {
359 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
360 	phys_addr_t end = start + size;
361 
362 	if (is_protected_kvm_enabled() && kvm->arch.pkvm.enabled)
363 		return;
364 
365 	lockdep_assert_held_write(&kvm->mmu_lock);
366 	WARN_ON(size & ~PAGE_MASK);
367 	WARN_ON(stage2_apply_range(mmu, start, end, ___unmap_stage2_range, may_block));
368 }
369 
370 void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
371 			    u64 size, bool may_block)
372 {
373 	__unmap_stage2_range(mmu, start, size, may_block);
374 }
375 
376 static int __stage2_flush_range(struct kvm_s2_mmu *mmu, u64 addr, u64 size)
377 {
378 	return KVM_PGT_FN(kvm_pgtable_stage2_flush)(mmu->pgt, addr, size);
379 }
380 
381 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
382 {
383 	stage2_apply_range_resched(mmu, addr, end, __stage2_flush_range);
384 }
385 
386 static void stage2_flush_memslot(struct kvm *kvm,
387 				 struct kvm_memory_slot *memslot)
388 {
389 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
390 	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
391 
392 	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
393 }
394 
395 /**
396  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
397  * @kvm: The struct kvm pointer
398  *
399  * Go through the stage 2 page tables and invalidate any cache lines
400  * backing memory already mapped to the VM.
401  */
402 static void stage2_flush_vm(struct kvm *kvm)
403 {
404 	struct kvm_memslots *slots;
405 	struct kvm_memory_slot *memslot;
406 	int idx, bkt;
407 
408 	idx = srcu_read_lock(&kvm->srcu);
409 	write_lock(&kvm->mmu_lock);
410 
411 	slots = kvm_memslots(kvm);
412 	kvm_for_each_memslot(memslot, bkt, slots)
413 		stage2_flush_memslot(kvm, memslot);
414 
415 	kvm_nested_s2_flush(kvm);
416 
417 	write_unlock(&kvm->mmu_lock);
418 	srcu_read_unlock(&kvm->srcu, idx);
419 }
420 
421 /**
422  * free_hyp_pgds - free Hyp-mode page tables
423  */
424 void __init free_hyp_pgds(void)
425 {
426 	mutex_lock(&kvm_hyp_pgd_mutex);
427 	if (hyp_pgtable) {
428 		kvm_pgtable_hyp_destroy(hyp_pgtable);
429 		kfree(hyp_pgtable);
430 		hyp_pgtable = NULL;
431 	}
432 	mutex_unlock(&kvm_hyp_pgd_mutex);
433 }
434 
435 static bool kvm_host_owns_hyp_mappings(void)
436 {
437 	if (is_kernel_in_hyp_mode())
438 		return false;
439 
440 	if (static_branch_likely(&kvm_protected_mode_initialized))
441 		return false;
442 
443 	/*
444 	 * This can happen at boot time when __create_hyp_mappings() is called
445 	 * after the hyp protection has been enabled, but the static key has
446 	 * not been flipped yet.
447 	 */
448 	if (!hyp_pgtable && is_protected_kvm_enabled())
449 		return false;
450 
451 	WARN_ON(!hyp_pgtable);
452 
453 	return true;
454 }
455 
456 int __create_hyp_mappings(unsigned long start, unsigned long size,
457 			  unsigned long phys, enum kvm_pgtable_prot prot)
458 {
459 	int err;
460 
461 	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
462 		return -EINVAL;
463 
464 	mutex_lock(&kvm_hyp_pgd_mutex);
465 	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
466 	mutex_unlock(&kvm_hyp_pgd_mutex);
467 
468 	return err;
469 }
470 
471 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
472 {
473 	if (!is_vmalloc_addr(kaddr)) {
474 		BUG_ON(!virt_addr_valid(kaddr));
475 		return __pa(kaddr);
476 	} else {
477 		return page_to_phys(vmalloc_to_page(kaddr)) +
478 		       offset_in_page(kaddr);
479 	}
480 }
481 
482 struct hyp_shared_pfn {
483 	u64 pfn;
484 	int count;
485 	struct rb_node node;
486 };
487 
488 static DEFINE_MUTEX(hyp_shared_pfns_lock);
489 static struct rb_root hyp_shared_pfns = RB_ROOT;
490 
491 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
492 					      struct rb_node **parent)
493 {
494 	struct hyp_shared_pfn *this;
495 
496 	*node = &hyp_shared_pfns.rb_node;
497 	*parent = NULL;
498 	while (**node) {
499 		this = container_of(**node, struct hyp_shared_pfn, node);
500 		*parent = **node;
501 		if (this->pfn < pfn)
502 			*node = &((**node)->rb_left);
503 		else if (this->pfn > pfn)
504 			*node = &((**node)->rb_right);
505 		else
506 			return this;
507 	}
508 
509 	return NULL;
510 }
511 
512 static int share_pfn_hyp(u64 pfn)
513 {
514 	struct rb_node **node, *parent;
515 	struct hyp_shared_pfn *this;
516 	int ret = 0;
517 
518 	mutex_lock(&hyp_shared_pfns_lock);
519 	this = find_shared_pfn(pfn, &node, &parent);
520 	if (this) {
521 		this->count++;
522 		goto unlock;
523 	}
524 
525 	this = kzalloc(sizeof(*this), GFP_KERNEL);
526 	if (!this) {
527 		ret = -ENOMEM;
528 		goto unlock;
529 	}
530 
531 	this->pfn = pfn;
532 	this->count = 1;
533 	rb_link_node(&this->node, parent, node);
534 	rb_insert_color(&this->node, &hyp_shared_pfns);
535 	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
536 unlock:
537 	mutex_unlock(&hyp_shared_pfns_lock);
538 
539 	return ret;
540 }
541 
542 static int unshare_pfn_hyp(u64 pfn)
543 {
544 	struct rb_node **node, *parent;
545 	struct hyp_shared_pfn *this;
546 	int ret = 0;
547 
548 	mutex_lock(&hyp_shared_pfns_lock);
549 	this = find_shared_pfn(pfn, &node, &parent);
550 	if (WARN_ON(!this)) {
551 		ret = -ENOENT;
552 		goto unlock;
553 	}
554 
555 	this->count--;
556 	if (this->count)
557 		goto unlock;
558 
559 	rb_erase(&this->node, &hyp_shared_pfns);
560 	kfree(this);
561 	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
562 unlock:
563 	mutex_unlock(&hyp_shared_pfns_lock);
564 
565 	return ret;
566 }
567 
568 int kvm_share_hyp(void *from, void *to)
569 {
570 	phys_addr_t start, end, cur;
571 	u64 pfn;
572 	int ret;
573 
574 	if (is_kernel_in_hyp_mode())
575 		return 0;
576 
577 	/*
578 	 * The share hcall maps things in the 'fixed-offset' region of the hyp
579 	 * VA space, so we can only share physically contiguous data-structures
580 	 * for now.
581 	 */
582 	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
583 		return -EINVAL;
584 
585 	if (kvm_host_owns_hyp_mappings())
586 		return create_hyp_mappings(from, to, PAGE_HYP);
587 
588 	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
589 	end = PAGE_ALIGN(__pa(to));
590 	for (cur = start; cur < end; cur += PAGE_SIZE) {
591 		pfn = __phys_to_pfn(cur);
592 		ret = share_pfn_hyp(pfn);
593 		if (ret)
594 			return ret;
595 	}
596 
597 	return 0;
598 }
599 
600 void kvm_unshare_hyp(void *from, void *to)
601 {
602 	phys_addr_t start, end, cur;
603 	u64 pfn;
604 
605 	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
606 		return;
607 
608 	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
609 	end = PAGE_ALIGN(__pa(to));
610 	for (cur = start; cur < end; cur += PAGE_SIZE) {
611 		pfn = __phys_to_pfn(cur);
612 		WARN_ON(unshare_pfn_hyp(pfn));
613 	}
614 }
615 
616 /**
617  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
618  * @from:	The virtual kernel start address of the range
619  * @to:		The virtual kernel end address of the range (exclusive)
620  * @prot:	The protection to be applied to this range
621  *
622  * The same virtual address as the kernel virtual address is also used
623  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
624  * physical pages.
625  */
626 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
627 {
628 	phys_addr_t phys_addr;
629 	unsigned long virt_addr;
630 	unsigned long start = kern_hyp_va((unsigned long)from);
631 	unsigned long end = kern_hyp_va((unsigned long)to);
632 
633 	if (is_kernel_in_hyp_mode())
634 		return 0;
635 
636 	if (!kvm_host_owns_hyp_mappings())
637 		return -EPERM;
638 
639 	start = start & PAGE_MASK;
640 	end = PAGE_ALIGN(end);
641 
642 	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
643 		int err;
644 
645 		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
646 		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
647 					    prot);
648 		if (err)
649 			return err;
650 	}
651 
652 	return 0;
653 }
654 
655 static int __hyp_alloc_private_va_range(unsigned long base)
656 {
657 	lockdep_assert_held(&kvm_hyp_pgd_mutex);
658 
659 	if (!PAGE_ALIGNED(base))
660 		return -EINVAL;
661 
662 	/*
663 	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
664 	 * allocating the new area, as it would indicate we've
665 	 * overflowed the idmap/IO address range.
666 	 */
667 	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
668 		return -ENOMEM;
669 
670 	io_map_base = base;
671 
672 	return 0;
673 }
674 
675 /**
676  * hyp_alloc_private_va_range - Allocates a private VA range.
677  * @size:	The size of the VA range to reserve.
678  * @haddr:	The hypervisor virtual start address of the allocation.
679  *
680  * The private virtual address (VA) range is allocated below io_map_base
681  * and aligned based on the order of @size.
682  *
683  * Return: 0 on success or negative error code on failure.
684  */
685 int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
686 {
687 	unsigned long base;
688 	int ret = 0;
689 
690 	mutex_lock(&kvm_hyp_pgd_mutex);
691 
692 	/*
693 	 * This assumes that we have enough space below the idmap
694 	 * page to allocate our VAs. If not, the check in
695 	 * __hyp_alloc_private_va_range() will kick in. A potential
696 	 * alternative would be to detect that overflow and switch
697 	 * to an allocation above the idmap.
698 	 *
699 	 * The allocated size is always a multiple of PAGE_SIZE.
700 	 */
701 	size = PAGE_ALIGN(size);
702 	base = io_map_base - size;
703 	ret = __hyp_alloc_private_va_range(base);
704 
705 	mutex_unlock(&kvm_hyp_pgd_mutex);
706 
707 	if (!ret)
708 		*haddr = base;
709 
710 	return ret;
711 }
712 
713 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
714 					unsigned long *haddr,
715 					enum kvm_pgtable_prot prot)
716 {
717 	unsigned long addr;
718 	int ret = 0;
719 
720 	if (!kvm_host_owns_hyp_mappings()) {
721 		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
722 					 phys_addr, size, prot);
723 		if (IS_ERR_VALUE(addr))
724 			return addr;
725 		*haddr = addr;
726 
727 		return 0;
728 	}
729 
730 	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
731 	ret = hyp_alloc_private_va_range(size, &addr);
732 	if (ret)
733 		return ret;
734 
735 	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
736 	if (ret)
737 		return ret;
738 
739 	*haddr = addr + offset_in_page(phys_addr);
740 	return ret;
741 }
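/*
 * Rough worked example for __create_hyp_private_mapping() above (assuming
 * 4K pages): for phys_addr = 0x80001234 and size = 0x100, the mapping size
 * is rounded up to PAGE_ALIGN(0x100 + 0x234) = 0x1000, a page-aligned
 * private VA is allocated for it, and *haddr is returned as that VA plus
 * the 0x234 page offset so it points at the requested physical byte.
 */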
742 
743 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
744 {
745 	unsigned long base;
746 	size_t size;
747 	int ret;
748 
749 	mutex_lock(&kvm_hyp_pgd_mutex);
750 	/*
751 	 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
752 	 * an alignment of our allocation on the order of the size.
753 	 */
754 	size = NVHE_STACK_SIZE * 2;
755 	base = ALIGN_DOWN(io_map_base - size, size);
756 
757 	ret = __hyp_alloc_private_va_range(base);
758 
759 	mutex_unlock(&kvm_hyp_pgd_mutex);
760 
761 	if (ret) {
762 		kvm_err("Cannot allocate hyp stack guard page\n");
763 		return ret;
764 	}
765 
766 	/*
767 	 * Since the stack grows downwards, map the stack to the page
768 	 * at the higher address and leave the lower guard page
769 	 * unbacked.
770 	 *
771 	 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
772 	 * and addresses corresponding to the guard page have the
773 	 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
774 	 */
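	/*
	 * Illustrative example (assuming NVHE_STACK_SIZE == PAGE_SIZE == 4KiB,
	 * i.e. NVHE_STACK_SHIFT == 12): base is aligned to 8KiB, so guard-page
	 * addresses in [base, base + 4KiB) have bit 12 clear while stack
	 * addresses in [base + 4KiB, base + 8KiB) have bit 12 set, which is
	 * what the overflow check relies on.
	 */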
775 	ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
776 				    phys_addr, PAGE_HYP);
777 	if (ret)
778 		kvm_err("Cannot map hyp stack\n");
779 
780 	*haddr = base + size;
781 
782 	return ret;
783 }
784 
785 /**
786  * create_hyp_io_mappings - Map IO into both kernel and HYP
787  * @phys_addr:	The physical start address which gets mapped
788  * @size:	Size of the region being mapped
789  * @kaddr:	Kernel VA for this mapping
790  * @haddr:	HYP VA for this mapping
791  */
792 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
793 			   void __iomem **kaddr,
794 			   void __iomem **haddr)
795 {
796 	unsigned long addr;
797 	int ret;
798 
799 	if (is_protected_kvm_enabled())
800 		return -EPERM;
801 
802 	*kaddr = ioremap(phys_addr, size);
803 	if (!*kaddr)
804 		return -ENOMEM;
805 
806 	if (is_kernel_in_hyp_mode()) {
807 		*haddr = *kaddr;
808 		return 0;
809 	}
810 
811 	ret = __create_hyp_private_mapping(phys_addr, size,
812 					   &addr, PAGE_HYP_DEVICE);
813 	if (ret) {
814 		iounmap(*kaddr);
815 		*kaddr = NULL;
816 		*haddr = NULL;
817 		return ret;
818 	}
819 
820 	*haddr = (void __iomem *)addr;
821 	return 0;
822 }
823 
824 /**
825  * create_hyp_exec_mappings - Map an executable range into HYP
826  * @phys_addr:	The physical start address which gets mapped
827  * @size:	Size of the region being mapped
828  * @haddr:	HYP VA for this mapping
829  */
830 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
831 			     void **haddr)
832 {
833 	unsigned long addr;
834 	int ret;
835 
836 	BUG_ON(is_kernel_in_hyp_mode());
837 
838 	ret = __create_hyp_private_mapping(phys_addr, size,
839 					   &addr, PAGE_HYP_EXEC);
840 	if (ret) {
841 		*haddr = NULL;
842 		return ret;
843 	}
844 
845 	*haddr = (void *)addr;
846 	return 0;
847 }
848 
849 static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
850 	/* We shouldn't need any other callback to walk the PT */
851 	.phys_to_virt		= kvm_host_va,
852 };
853 
854 static int get_user_mapping_size(struct kvm *kvm, u64 addr)
855 {
856 	struct kvm_pgtable pgt = {
857 		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
858 		.ia_bits	= vabits_actual,
859 		.start_level	= (KVM_PGTABLE_LAST_LEVEL -
860 				   ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
861 		.mm_ops		= &kvm_user_mm_ops,
862 	};
863 	unsigned long flags;
864 	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
865 	s8 level = S8_MAX;
866 	int ret;
867 
868 	/*
869 	 * Disable IRQs so that we hazard against a concurrent
870 	 * teardown of the userspace page tables (which relies on
871 	 * IPI-ing threads).
872 	 */
873 	local_irq_save(flags);
874 	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
875 	local_irq_restore(flags);
876 
877 	if (ret)
878 		return ret;
879 
880 	/*
881 	 * Not seeing an error, but not updating level? Something went
882 	 * deeply wrong...
883 	 */
884 	if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
885 		return -EFAULT;
886 	if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
887 		return -EFAULT;
888 
889 	/* Oops, the userspace PTs are gone... Replay the fault */
890 	if (!kvm_pte_valid(pte))
891 		return -EAGAIN;
892 
893 	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
894 }
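/*
 * Illustrative note for get_user_mapping_size() above (assuming 4K pages):
 * a leaf found at level 2 yields BIT(21) == 2MiB, while a level-3 leaf
 * yields BIT(12) == PAGE_SIZE.
 */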
895 
896 static bool stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
897 {
898 	return false;
899 }
900 
901 static bool stage2_pte_is_counted(kvm_pte_t pte, u32 level)
902 
903 {
904 	return !!pte;
905 }
906 
907 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
908 	.zalloc_page		= stage2_memcache_zalloc_page,
909 	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
910 	.free_pages_exact	= kvm_s2_free_pages_exact,
911 	.free_unlinked_table	= stage2_free_unlinked_table,
912 	.get_page		= kvm_host_get_page,
913 	.put_page		= kvm_s2_put_page,
914 	.page_count		= kvm_host_page_count,
915 	.phys_to_virt		= kvm_host_va,
916 	.virt_to_phys		= kvm_host_pa,
917 	.dcache_clean_inval_poc	= clean_dcache_guest_page,
918 	.icache_inval_pou	= invalidate_icache_guest_page,
919 };
920 
921 static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
922 {
923 	u32 kvm_ipa_limit = get_kvm_ipa_limit();
924 	u64 mmfr0, mmfr1;
925 	u32 phys_shift;
926 
927 	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
928 	if (is_protected_kvm_enabled()) {
929 		phys_shift = kvm_ipa_limit;
930 	} else if (phys_shift) {
931 		if (phys_shift > kvm_ipa_limit ||
932 		    phys_shift < ARM64_MIN_PARANGE_BITS)
933 			return -EINVAL;
934 	} else {
935 		phys_shift = KVM_PHYS_SHIFT;
936 		if (phys_shift > kvm_ipa_limit) {
937 			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
938 				     current->comm);
939 			return -EINVAL;
940 		}
941 	}
942 
943 	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
944 	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
945 	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
946 
947 	return 0;
948 }
949 
950 /**
951  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
952  * @kvm:	The pointer to the KVM structure
953  * @mmu:	The pointer to the s2 MMU structure
954  * @type:	The machine type of the virtual machine
955  *
956  * Allocates only the stage-2 HW PGD level table(s).
957  * Note we don't need locking here as this is only called in two cases:
958  *
959  * - when the VM is created, which can't race against anything
960  *
961  * - when secondary kvm_s2_mmu structures are initialised for NV
962  *   guests, and the caller must hold kvm->lock as this is called on a
963  *   per-vcpu basis.
964  */
965 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
966 {
967 	int cpu, err;
968 	struct kvm_pgtable *pgt;
969 
970 	kvm->arch.pkvm.pinned_pages = RB_ROOT_CACHED;
971 
972 	/*
973 	 * If we already have our page tables in place, and the
974 	 * MMU context is the canonical one, we have a bug somewhere,
975 	 * as this is only supposed to ever happen once per VM.
976 	 *
977 	 * Otherwise, we're building nested page tables, and that's
978 	 * probably because userspace called KVM_ARM_VCPU_INIT more
979 	 * than once on the same vcpu. Since that's actually legal,
980 	 * don't kick a fuss and leave gracefully.
981 	 * don't kick up a fuss and leave gracefully.
982 	if (mmu->pgt != NULL) {
983 		if (kvm_is_nested_s2_mmu(kvm, mmu))
984 			return 0;
985 
986 		kvm_err("kvm_arch already initialized?\n");
987 		return -EINVAL;
988 	}
989 
990 	err = kvm_init_ipa_range(mmu, type);
991 	if (err)
992 		return err;
993 
994 	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
995 	if (!pgt)
996 		return -ENOMEM;
997 
998 	mmu->arch = &kvm->arch;
999 	err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops, &kvm_s2_pte_ops);
1000 	if (err)
1001 		goto out_free_pgtable;
1002 
1003 	mmu->pgt = pgt;
1004 	if (is_protected_kvm_enabled())
1005 		return 0;
1006 
1007 	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
1008 	if (!mmu->last_vcpu_ran) {
1009 		err = -ENOMEM;
1010 		goto out_destroy_pgtable;
1011 	}
1012 
1013 	for_each_possible_cpu(cpu)
1014 		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
1015 
1016 	/* Eager page splitting is disabled by default */
1017 	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
1018 	mmu->split_page_cache.gfp_zero = __GFP_ZERO;
1019 
1020 	mmu->pgd_phys = __pa(pgt->pgd);
1021 
1022 	if (kvm_is_nested_s2_mmu(kvm, mmu))
1023 		kvm_init_nested_s2_mmu(mmu);
1024 
1025 	return 0;
1026 
1027 out_destroy_pgtable:
1028 	KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
1029 out_free_pgtable:
1030 	kfree(pgt);
1031 	return err;
1032 }
1033 
1034 void kvm_uninit_stage2_mmu(struct kvm *kvm)
1035 {
1036 	write_lock(&kvm->mmu_lock);
1037 	kvm_stage2_unmap_range(&kvm->arch.mmu, 0, BIT(VTCR_EL2_IPA(kvm->arch.mmu.vtcr)), true);
1038 	write_unlock(&kvm->mmu_lock);
1039 	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
1040 }
1041 
1042 static void stage2_unmap_memslot(struct kvm *kvm,
1043 				 struct kvm_memory_slot *memslot)
1044 {
1045 	hva_t hva = memslot->userspace_addr;
1046 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
1047 	phys_addr_t size = PAGE_SIZE * memslot->npages;
1048 	hva_t reg_end = hva + size;
1049 
1050 	/*
1051 	 * A memory region could potentially cover multiple VMAs, and any holes
1052 	 * between them, so iterate over all of them to find out if we should
1053 	 * unmap any of them.
1054 	 *
1055 	 *     +--------------------------------------------+
1056 	 * +---------------+----------------+   +----------------+
1057 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
1058 	 * +---------------+----------------+   +----------------+
1059 	 *     |               memory region                |
1060 	 *     +--------------------------------------------+
1061 	 */
1062 	do {
1063 		struct vm_area_struct *vma;
1064 		hva_t vm_start, vm_end;
1065 
1066 		vma = find_vma_intersection(current->mm, hva, reg_end);
1067 		if (!vma)
1068 			break;
1069 
1070 		/*
1071 		 * Take the intersection of this VMA with the memory region
1072 		 */
1073 		vm_start = max(hva, vma->vm_start);
1074 		vm_end = min(reg_end, vma->vm_end);
1075 
1076 		if (!(vma->vm_flags & VM_PFNMAP)) {
1077 			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
1078 			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
1079 		}
1080 		hva = vm_end;
1081 	} while (hva < reg_end);
1082 }
1083 
1084 /**
1085  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
1086  * @kvm: The struct kvm pointer
1087  *
1088  * Go through the memregions and unmap any regular RAM
1089  * backing memory already mapped to the VM.
1090  */
1091 void stage2_unmap_vm(struct kvm *kvm)
1092 {
1093 	struct kvm_memslots *slots;
1094 	struct kvm_memory_slot *memslot;
1095 	int idx, bkt;
1096 
1097 	idx = srcu_read_lock(&kvm->srcu);
1098 	mmap_read_lock(current->mm);
1099 	write_lock(&kvm->mmu_lock);
1100 
1101 	slots = kvm_memslots(kvm);
1102 	kvm_for_each_memslot(memslot, bkt, slots)
1103 		stage2_unmap_memslot(kvm, memslot);
1104 
1105 	kvm_nested_s2_unmap(kvm, true);
1106 
1107 	write_unlock(&kvm->mmu_lock);
1108 	mmap_read_unlock(current->mm);
1109 	srcu_read_unlock(&kvm->srcu, idx);
1110 }
1111 
1112 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
1113 {
1114 	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
1115 	struct kvm_pgtable *pgt = NULL;
1116 
1117 	write_lock(&kvm->mmu_lock);
1118 	pgt = mmu->pgt;
1119 	if (pgt) {
1120 		mmu->pgd_phys = 0;
1121 		mmu->pgt = NULL;
1122 		free_percpu(mmu->last_vcpu_ran);
1123 	}
1124 	write_unlock(&kvm->mmu_lock);
1125 
1126 	if (pgt) {
1127 		KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
1128 		kfree(pgt);
1129 	}
1130 }
1131 
1132 static void hyp_mc_free_fn(void *addr, void *flags, unsigned long order)
1133 {
1134 	static const u8 pmd_order = PMD_SHIFT - PAGE_SHIFT;
1135 
1136 	if (!addr)
1137 		return;
1138 
1139 	if ((unsigned long)flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
1140 		kvm_account_pgtable_pages(addr, -1);
1141 
1142 	/* The iommu pool supports top-up from dma_contiguous_default_area */
1143 	if (order == pmd_order &&
1144 	    kvm_iommu_cma_release(virt_to_page(addr)))
1145 		return;
1146 
1147 	free_pages((unsigned long)addr, order);
1148 }
1149 
1150 static void *hyp_mc_alloc_fn(void *flags, unsigned long order)
1151 {
1152 	unsigned long __flags = (unsigned long)flags;
1153 	gfp_t gfp_mask;
1154 	void *addr;
1155 
1156 	gfp_mask = __flags & HYP_MEMCACHE_ACCOUNT_KMEMCG ?
1157 		   GFP_KERNEL_ACCOUNT : GFP_KERNEL;
1158 
1159 	addr = (void *)__get_free_pages(gfp_mask, order);
1160 
1161 	if (addr && __flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
1162 		kvm_account_pgtable_pages(addr, 1);
1163 
1164 	return addr;
1165 }
1166 
1167 static void *hyp_mc_alloc_gfp_fn(void *flags, unsigned long order)
1168 {
1169 	return (void *)__get_free_pages(*(gfp_t *)flags, order);
1170 }
1171 
1172 void free_hyp_memcache(struct kvm_hyp_memcache *mc)
1173 {
1174 	unsigned long flags = mc->flags;
1175 
1176 	if (!is_protected_kvm_enabled())
1177 		return;
1178 
1179 	kfree(mc->mapping);
1180 	__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, (void *)flags);
1181 }
1182 
1183 int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
1184 		       unsigned long order)
1185 {
1186 	unsigned long flags = mc->flags;
1187 
1188 	if (!is_protected_kvm_enabled())
1189 		return 0;
1190 
1191 	if (order > PAGE_SHIFT)
1192 		return -E2BIG;
1193 
1194 	if (!mc->mapping) {
1195 		mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
1196 		if (!mc->mapping)
1197 			return -ENOMEM;
1198 	}
1199 
1200 	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
1201 				    kvm_host_pa, (void *)flags, order);
1202 }
1203 EXPORT_SYMBOL(topup_hyp_memcache);
1204 
1205 static int topup_hyp_memcache_account(struct kvm *kvm, struct kvm_hyp_memcache *mc,
1206 				      unsigned long min_pages, unsigned long order)
1207 {
1208 	u64 nr_pages = mc->nr_pages;
1209 	int ret;
1210 
1211 	ret = topup_hyp_memcache(mc, min_pages, order);
1212 	if (ret)
1213 		return -ENOMEM;
1214 
1215 	nr_pages = mc->nr_pages - nr_pages;
1216 	atomic64_add(nr_pages << PAGE_SHIFT, &kvm->stat.protected_hyp_mem);
1217 
1218 	return 0;
1219 }
1220 
1221 int topup_hyp_memcache_gfp(struct kvm_hyp_memcache *mc, unsigned long min_pages,
1222 			   unsigned long order, gfp_t gfp)
1223 {
1224 	void *flags = &gfp;
1225 
1226 	if (!is_protected_kvm_enabled())
1227 		return 0;
1228 
1229 	if (order > PAGE_SHIFT)
1230 		return -E2BIG;
1231 
1232 	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_gfp_fn,
1233 				    kvm_host_pa, flags, order);
1234 }
1235 
1236 /**
1237  * kvm_phys_addr_ioremap - map a device range to guest IPA
1238  *
1239  * @kvm:	The KVM pointer
1240  * @guest_ipa:	The IPA at which to insert the mapping
1241  * @pa:		The physical address of the device
1242  * @size:	The size of the mapping
1243  * @writable:   Whether or not to create a writable mapping
1244  */
1245 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1246 			  phys_addr_t pa, unsigned long size, bool writable)
1247 {
1248 	phys_addr_t addr;
1249 	int ret = 0;
1250 	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
1251 	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
1252 	struct kvm_pgtable *pgt = mmu->pgt;
1253 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1254 				     KVM_PGTABLE_PROT_R |
1255 				     (writable ? KVM_PGTABLE_PROT_W : 0);
1256 
1257 	if (is_protected_kvm_enabled())
1258 		return -EPERM;
1259 
1260 	size += offset_in_page(guest_ipa);
1261 	guest_ipa &= PAGE_MASK;
1262 
1263 	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1264 		ret = kvm_mmu_topup_memory_cache(&cache,
1265 						 kvm_mmu_cache_min_pages(mmu));
1266 		if (ret)
1267 			break;
1268 
1269 		write_lock(&kvm->mmu_lock);
1270 		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
1271 				 pa, prot, &cache, 0);
1272 		write_unlock(&kvm->mmu_lock);
1273 		if (ret)
1274 			break;
1275 
1276 		pa += PAGE_SIZE;
1277 	}
1278 
1279 	kvm_mmu_free_memory_cache(&cache);
1280 	return ret;
1281 }
1282 
1283 static int __stage2_wp_range(struct kvm_s2_mmu *mmu, u64 addr, u64 size)
1284 {
1285 	return KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)(mmu->pgt, addr, size);
1286 }
1287 
1288 /**
1289  * kvm_stage2_wp_range() - write protect stage2 memory region range
1290  * @mmu:        The KVM stage-2 MMU pointer
1291  * @addr:	Start address of range
1292  * @end:	End address of range
1293  */
1294 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1295 {
1296 	stage2_apply_range_resched(mmu, addr, end, __stage2_wp_range);
1297 }
1298 
1299 /**
1300  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1301  * @kvm:	The KVM pointer
1302  * @slot:	The memory slot to write protect
1303  *
1304  * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES
1305  * operation is applied to the memory region. After this function returns,
1306  * all present PUD, PMD and PTE entries in the memory region are write
1307  * protected, and the dirty page log can then be read.
1308  *
1309  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1310  * serializing operations for VM memory regions.
1311  */
1312 static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1313 {
1314 	struct kvm_memslots *slots = kvm_memslots(kvm);
1315 	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1316 	phys_addr_t start, end;
1317 
1318 	if (WARN_ON_ONCE(!memslot))
1319 		return;
1320 
1321 	start = memslot->base_gfn << PAGE_SHIFT;
1322 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1323 
1324 	write_lock(&kvm->mmu_lock);
1325 	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
1326 	kvm_nested_s2_wp(kvm);
1327 	write_unlock(&kvm->mmu_lock);
1328 	kvm_flush_remote_tlbs_memslot(kvm, memslot);
1329 }
1330 
1331 /**
1332  * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
1333  *				   pages for memory slot
1334  * @kvm:	The KVM pointer
1335  * @slot:	The memory slot to split
1336  *
1337  * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
1338  * serializing operations for VM memory regions.
1339  */
1340 static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
1341 {
1342 	struct kvm_memslots *slots;
1343 	struct kvm_memory_slot *memslot;
1344 	phys_addr_t start, end;
1345 
1346 	lockdep_assert_held(&kvm->slots_lock);
1347 
1348 	slots = kvm_memslots(kvm);
1349 	memslot = id_to_memslot(slots, slot);
1350 
1351 	start = memslot->base_gfn << PAGE_SHIFT;
1352 	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1353 
1354 	write_lock(&kvm->mmu_lock);
1355 	kvm_mmu_split_huge_pages(kvm, start, end);
1356 	write_unlock(&kvm->mmu_lock);
1357 }
1358 
1359 /*
1360  * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
1361  * @kvm:	The KVM pointer
1362  * @slot:	The memory slot associated with mask
1363  * @gfn_offset:	The gfn offset in memory slot
1364  * @mask:	The mask of pages at offset 'gfn_offset' in this memory
1365  *		slot to enable dirty logging on
1366  *
1367  * Write-protects selected pages to enable dirty logging, and then
1368  * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
1369  */
1370 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1371 		struct kvm_memory_slot *slot,
1372 		gfn_t gfn_offset, unsigned long mask)
1373 {
1374 	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1375 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1376 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1377 
1378 	lockdep_assert_held_write(&kvm->mmu_lock);
1379 
1380 	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
1381 
1382 	/*
1383 	 * Eager-splitting is done when manual-protect is set.  We
1384 	 * also check for initially-all-set because we can avoid
1385 	 * eager-splitting if initially-all-set is false.
1386 	 * Initially-all-set equal false implies that huge-pages were
1387 	 * already split when enabling dirty logging: no need to do it
1388 	 * again.
1389 	 */
1390 	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1391 		kvm_mmu_split_huge_pages(kvm, start, end);
1392 
1393 	kvm_nested_s2_wp(kvm);
1394 }
1395 
1396 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1397 {
1398 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1399 }
1400 
1401 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1402 					       unsigned long hva,
1403 					       unsigned long map_size)
1404 {
1405 	gpa_t gpa_start;
1406 	hva_t uaddr_start, uaddr_end;
1407 	size_t size;
1408 
1409 	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1410 	if (map_size == PAGE_SIZE)
1411 		return true;
1412 
1413 	size = memslot->npages * PAGE_SIZE;
1414 
1415 	gpa_start = memslot->base_gfn << PAGE_SHIFT;
1416 
1417 	uaddr_start = memslot->userspace_addr;
1418 	uaddr_end = uaddr_start + size;
1419 
1420 	/*
1421 	 * Pages belonging to memslots that don't have the same alignment
1422 	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1423 	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1424 	 *
1425 	 * Consider a layout like the following:
1426 	 *
1427 	 *    memslot->userspace_addr:
1428 	 *    +-----+--------------------+--------------------+---+
1429 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1430 	 *    +-----+--------------------+--------------------+---+
1431 	 *
1432 	 *    memslot->base_gfn << PAGE_SHIFT:
1433 	 *      +---+--------------------+--------------------+-----+
1434 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1435 	 *      +---+--------------------+--------------------+-----+
1436 	 *
1437 	 * If we create those stage-2 blocks, we'll end up with this incorrect
1438 	 * mapping:
1439 	 *   d -> f
1440 	 *   e -> g
1441 	 *   f -> h
1442 	 */
1443 	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1444 		return false;
1445 
1446 	/*
1447 	 * Next, let's make sure we're not trying to map anything not covered
1448 	 * by the memslot. This means we have to prohibit block size mappings
1449 	 * for the beginning and end of a non-block aligned and non-block sized
1450 	 * memory slot (illustrated by the head and tail parts of the
1451 	 * userspace view above containing pages 'abcde' and 'xyz',
1452 	 * respectively).
1453 	 *
1454 	 * Note that it doesn't matter if we do the check using the
1455 	 * userspace_addr or the base_gfn, as both are equally aligned (per
1456 	 * the check above) and equally sized.
1457 	 */
1458 	return (hva & ~(map_size - 1)) >= uaddr_start &&
1459 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1460 }
1461 
1462 /*
1463  * Check if the given hva is backed by a transparent huge page (THP) and
1464  * whether it can be mapped using block mapping in stage2. If so, adjust
1465  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1466  * supported. This will need to be updated to support other THP sizes.
1467  *
1468  * Returns the size of the mapping.
1469  */
1470 static long
1471 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
1472 			    unsigned long hva, kvm_pfn_t *pfnp,
1473 			    phys_addr_t *ipap)
1474 {
1475 	kvm_pfn_t pfn = *pfnp;
1476 
1477 	/*
1478 	 * Make sure the adjustment is done only for THP pages. Also make
1479 	 * sure that the HVA and IPA are sufficiently aligned and that the
1480 	 * block map is contained within the memslot.
1481 	 */
1482 	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1483 		int sz = get_user_mapping_size(kvm, hva);
1484 
1485 		if (sz < 0)
1486 			return sz;
1487 
1488 		if (sz < PMD_SIZE)
1489 			return PAGE_SIZE;
1490 
1491 		*ipap &= PMD_MASK;
1492 		pfn &= ~(PTRS_PER_PMD - 1);
1493 		*pfnp = pfn;
1494 
1495 		return PMD_SIZE;
1496 	}
1497 
1498 	/* Use page mapping if we cannot use block mapping. */
1499 	return PAGE_SIZE;
1500 }
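/*
 * Rough worked example for transparent_hugepage_adjust() above (assuming
 * 4K pages, PTRS_PER_PMD == 512): for a fault at *ipap = 0x40280000 backed
 * by a PMD-sized THP, the IPA is rounded down to 0x40200000 and the pfn is
 * rounded down to the matching 512-page boundary, so the whole 2MiB block
 * can be mapped in one go.
 */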
1501 
1502 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
1503 {
1504 	unsigned long pa;
1505 
1506 	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
1507 		return huge_page_shift(hstate_vma(vma));
1508 
1509 	if (!(vma->vm_flags & VM_PFNMAP))
1510 		return PAGE_SHIFT;
1511 
1512 	VM_BUG_ON(is_vm_hugetlb_page(vma));
1513 
1514 	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
1515 
1516 #ifndef __PAGETABLE_PMD_FOLDED
1517 	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
1518 	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
1519 	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
1520 		return PUD_SHIFT;
1521 #endif
1522 
1523 	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
1524 	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
1525 	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
1526 		return PMD_SHIFT;
1527 
1528 	return PAGE_SHIFT;
1529 }
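/*
 * Illustrative note for get_vma_page_shift() above: a VM_PFNMAP VMA only
 * reports PMD_SHIFT (or PUD_SHIFT) when the faulting hva and the backing
 * physical address are congruent modulo the block size and the whole block
 * lies within the VMA; otherwise it falls back to PAGE_SHIFT.
 */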
1530 
1531 /*
1532  * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
1533  * able to see the page's tags and therefore they must be initialised first. If
1534  * PG_mte_tagged is set, tags have already been initialised.
1535  *
1536  * The race in the test/set of the PG_mte_tagged flag is handled by:
1537  * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
1538  *   racing to sanitise the same page
1539  * - mmap_lock protects between a VM faulting a page in and the VMM performing
1540  *   an mprotect() to add VM_MTE
1541  */
1542 static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
1543 			      unsigned long size)
1544 {
1545 	unsigned long i, nr_pages = size >> PAGE_SHIFT;
1546 	struct page *page = pfn_to_page(pfn);
1547 
1548 	if (!kvm_has_mte(kvm))
1549 		return;
1550 
1551 	for (i = 0; i < nr_pages; i++, page++) {
1552 		if (try_page_mte_tagging(page)) {
1553 			mte_clear_page_tags(page_address(page));
1554 			set_page_mte_tagged(page);
1555 		}
1556 	}
1557 }
1558 
1559 static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
1560 {
1561 	return vma->vm_flags & VM_MTE_ALLOWED;
1562 }
1563 
1564 static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
1565 {
1566 	return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages,
1567 					   ipa, ipa + PAGE_SIZE - 1);
1568 }
1569 
1570 static u16 pkvm_prefault __read_mostly;
1571 
1572 static int __init early_pkvm_prefault_cfg(char *buf)
1573 {
1574 	int ret = kstrtou16(buf, 10, &pkvm_prefault);
1575 
1576 	if (ret)
1577 		return ret;
1578 
1579 	pkvm_prefault = min(pkvm_prefault, 9);
1580 
1581 	return 0;
1582 }
1583 early_param("kvm-arm.protected_prefault", early_pkvm_prefault_cfg);
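/*
 * Illustrative note (not part of the original source): this is set from the
 * kernel command line, e.g. "kvm-arm.protected_prefault=4"; the value is
 * parsed as base-10 and clamped to at most 9.
 */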
1584 
1585 static int insert_ppage(struct kvm *kvm, struct kvm_pinned_page *ppage)
1586 {
1587 	if (find_ppage(kvm, ppage->ipa))
1588 		return -EEXIST;
1589 
1590 	kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
1591 
1592 	return 0;
1593 }
1594 
1595 static long __pkvm_align_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1596 				 gfn_t gfn, size_t size)
1597 {
1598 	gfn_t memslot_end, gfn_end;
1599 	unsigned long hva;
1600 	bool writable;
1601 
1602 	size = PAGE_ALIGN(size);
1603 
1604 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1605 	if (kvm_is_error_hva(hva) || (kvm->arch.pkvm.enabled && !writable))
1606 		return -EINVAL;
1607 
1608 	memslot_end = memslot->base_gfn + memslot->npages;
1609 	gfn_end = min(gfn + (size >> PAGE_SHIFT), memslot_end);
1610 
1611 	return gfn_end - gfn;
1612 }
1613 
1614 static int __pkvm_pin_user_pages(struct kvm *kvm, struct kvm_memory_slot *memslot,
1615 				 u64 gfn, u64 nr_pages, struct page ***__pages)
1616 {
1617 	unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
1618 	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
1619 	struct mm_struct *mm = current->mm;
1620 	struct page **pages;
1621 	long ret;
1622 	int p;
1623 
1624 	pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
1625 	if (!pages)
1626 		return -ENOMEM;
1627 
1628 	mmap_read_lock(mm);
1629 	ret = pin_user_pages(hva, nr_pages, flags, pages);
1630 	mmap_read_unlock(mm);
1631 
1632 	if (ret == -EHWPOISON) {
1633 		kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
1634 		goto err_free_pages;
1635 	} else if (ret == -EFAULT) {
1636 		/* Will try MMIO map */
1637 		ret = -EREMOTEIO;
1638 		goto err_free_pages;
1639 	} else if (ret < 0) {
1640 		ret = -EFAULT;
1641 		goto err_free_pages;
1642 	} else if (ret != nr_pages) {
1643 		nr_pages = ret;
1644 		ret = -EFAULT;
1645 		goto err_unpin_pages;
1646 	}
1647 
1648 	/*
1649 	 * We really can't deal with page-cache pages returned by GUP
1650 	 * because (a) we may trigger writeback of a page for which we
1651 	 * no longer have access and (b) page_mkclean() won't find the
1652 	 * stage-2 mapping in the rmap so we can get out-of-whack with
1653 	 * the filesystem when marking the page dirty during unpinning
1654 	 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
1655 	 * without asking ext4 first")).
1656 	 *
1657 	 * Ideally we'd just restrict ourselves to anonymous pages, but
1658 	 * we also want to allow memfd (i.e. shmem) pages, so check for
1659 	 * pages backed by swap in the knowledge that the GUP pin will
1660 	 * prevent try_to_unmap() from succeeding.
1661 	 */
1662 	for (p = 0; p < nr_pages; p++) {
1663 		if (!folio_test_swapbacked(page_folio(pages[p]))) {
1664 			ret = -EIO;
1665 			goto err_unpin_pages;
1666 		}
1667 	}
1668 
1669 	*__pages = pages;
1670 	return 0;
1671 
1672 err_unpin_pages:
1673 	unpin_user_pages(pages, nr_pages);
1674 err_free_pages:
1675 	kfree(pages);
1676 	return ret;
1677 }
1678 
1679 /*
1680  * pKVM relies on pinning the page and then using its pfn to map it.
1681  * However, to avoid adding overhead on the hot path by checking the pfn
1682  * first, the device check is done on the failure path of pin_user_pages(),
1683  * in the -EFAULT case. This works because the VMA of a device mapping is
1684  * VM_IO, which makes check_vma_flags() fail with -EFAULT.
1685  */
1686 static int __pkvm_mem_abort_device(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
1687 				   gfn_t gfn, u64 nr_pages)
1688 {
1689 	while (nr_pages--) {
1690 		kvm_pfn_t pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
1691 						     kvm_is_write_fault(vcpu), NULL, NULL);
1692 		if (is_error_noslot_pfn(pfn))
1693 			return -EFAULT;
1694 
1695 		if (kvm_is_device_pfn(pfn)) {
1696 			int ret = kvm_call_hyp_nvhe(__pkvm_host_map_guest_mmio, pfn, gfn);
1697 
1698 			if (ret == -EEXIST)
1699 				ret = 0; /* We might have raced with another vCPU. */
1700 		} else {
1701 			/* Release pin from __gfn_to_pfn_memslot(). */
1702 			kvm_release_pfn_clean(pfn);
1703 			return -EFAULT;
1704 		}
1705 
1706 		gfn++;
1707 	}
1708 
1709 	return 0;
1710 }
1711 
1712 /*
1713  * Create a list of kvm_pinned_page entries from the array of pages returned
1714  * by __pkvm_pin_user_pages() in preparation for the EL2 mapping.
1715  *
1716  * On success, no unpinning is necessary. On error, the entire original pages
1717  * array must be unpinned.
1718  */
1719 static int
1720 __pkvm_pages_to_ppages(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn,
1721 		       long *__nr_pages, struct page **pages, struct list_head *ppages)
1722 {
1723 	struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc);
1724 	long nr_ppages = 0, nr_pages = *__nr_pages;
1725 	struct kvm_pinned_page *ppage, *tmp;
1726 	int p, ret = 0;
1727 
1728 	/* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */
1729 	for (p = 0; p < nr_pages; p++) {
1730 		ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
1731 		if (!ppage) {
1732 			ret = -ENOMEM;
1733 			goto err;
1734 		}
1735 		list_add(&ppage->list_node, &ppage_prealloc);
1736 	}
1737 
1738 	p = 0;
1739 	read_lock(&kvm->mmu_lock);
1740 	while (p < nr_pages) {
1741 		phys_addr_t ipa = gfn << PAGE_SHIFT;
1742 		long skip, page_size = PAGE_SIZE;
1743 		struct page *page = pages[p];
1744 		u64 pfn;
1745 
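		/* This IPA already has a pinned page: drop the extra pin and move on. */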
1746 		ppage = kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages,
1747 						    ipa, ipa + PAGE_SIZE - 1);
1748 		if (ppage) {
1749 			unpin_user_pages(&page, 1);
1750 			goto next;
1751 		}
1752 
1753 		pfn = page_to_pfn(page);
1754 
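		/*
		 * If nothing is pinned yet in the PMD-sized region covering this
		 * IPA, see whether the backing can be promoted to a THP block.
		 */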
1755 		if (!kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages,
1756 						 ALIGN_DOWN(ipa, PMD_SIZE),
1757 						 ALIGN(ipa + 1, PMD_SIZE) - 1)) {
1758 			unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
1759 
1760 			page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &ipa);
1761 		}
1762 
1763 		/* Pop a ppage from the pre-allocated list */
1764 		ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node);
1765 		list_del_init(&ppage->list_node);
1766 
1767 		ppage->page = pfn_to_page(pfn);
1768 		ppage->ipa = ipa;
1769 		ppage->order = get_order(page_size);
1770 		list_add_tail(&ppage->list_node, ppages);
1771 		nr_ppages += 1 << ppage->order;
1772 
1773 next:
1774 		/* Number of pages to skip (covered by a THP) */
1775 		skip = ppage->order ? ALIGN(gfn + 1, 1 << ppage->order) - gfn - 1 : 0;
1776 		if (skip) {
1777 			long nr_pins = min_t(long, skip, nr_pages - p - 1);
1778 
1779 			if (nr_pins >= 1)
1780 				unpin_user_pages(&pages[p + 1], nr_pins);
1781 		}
1782 
1783 		p += skip + 1;
1784 		gfn += skip + 1;
1785 	}
1786 	read_unlock(&kvm->mmu_lock);
1787 
1788 	*__nr_pages = nr_ppages;
1789 
1790 err:
1791 	/* Free unused pre-allocated kvm_pinned_page */
1792 	list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) {
1793 		list_del(&ppage->list_node);
1794 		kfree(ppage);
1795 	}
1796 
1797 	return ret;
1798 }
1799 
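/*
 * Estimate how many stage-2 page-table pages are needed to map the pinned
 * range and top up the vCPU's stage-2 memcache accordingly.
 */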
1800 static int __pkvm_topup_stage2_memcache(struct kvm_vcpu *vcpu, struct list_head *ppages)
1801 {
1802 	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
1803 	struct kvm_s2_mmu *mmu = &vcpu->kvm->arch.mmu;
1804 	struct kvm_pinned_page *first, *last;
1805 	unsigned long nr_stage2_pages;
1806 	size_t size;
1807 
1808 	last = list_last_entry(ppages, struct kvm_pinned_page, list_node);
1809 	first = list_first_entry(ppages, struct kvm_pinned_page, list_node);
1810 	size = ALIGN(last->ipa + (PAGE_SIZE << last->order), PMD_SIZE) -
1811 	       ALIGN_DOWN(first->ipa, PMD_SIZE);
1812 
1813 	/*
1814 	 * (size in blocks) * (pages needed to install a stage-2 translation)
1815 	 *
1816 	 * Does not take into account possible (but unlikely) discontinuities in
1817 	 * the ppages list.
1818 	 */
1819 	nr_stage2_pages = (size >> PAGE_SHIFT) / PTRS_PER_PTE;
1820 	nr_stage2_pages *= kvm_mmu_cache_min_pages(mmu);
1821 
1822 	return topup_hyp_memcache_account(vcpu->kvm, hyp_memcache, nr_stage2_pages, 0);
1823 }
1824 
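/*
 * Donate the pinned pages to the guest in batches: fill the per-vCPU hyp_reqs
 * pinned-page list, terminate it, and issue one hypercall per batch so the
 * time spent at EL2 stays bounded.
 */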
1825 static int __pkvm_host_donate_guest_sglist(struct kvm_vcpu *vcpu, struct list_head *ppages)
1826 {
1827 	struct kvm *kvm = vcpu->kvm;
1828 	int ret;
1829 
1830 	lockdep_assert_held_write(&kvm->mmu_lock);
1831 
1832 	do {
1833 		struct kvm_hyp_pinned_page *hyp_ppage = NULL;
1834 		struct kvm_pinned_page *tmp, *ppage;
1835 		int p, nr_ppages = 0;
1836 
1837 		list_for_each_entry(ppage, ppages, list_node) {
1838 			u64 pfn = page_to_pfn(ppage->page);
1839 			gfn_t gfn = ppage->ipa >> PAGE_SHIFT;
1840 
1841 			hyp_ppage = next_kvm_hyp_pinned_page(vcpu->arch.hyp_reqs, hyp_ppage, false);
1842 			if (!hyp_ppage)
1843 				break;
1844 
1845 			hyp_ppage->pfn = pfn;
1846 			hyp_ppage->gfn = gfn;
1847 			hyp_ppage->order = ppage->order;
1848 			nr_ppages++;
1849 
1850 			/* Limit the time spent at EL2 */
1851 			if (nr_ppages >= (1 << max(pkvm_prefault, 5)))
1852 				break;
1853 		}
1854 
1855 		if (hyp_ppage) {
1856 			hyp_ppage = next_kvm_hyp_pinned_page(vcpu->arch.hyp_reqs, hyp_ppage, false);
1857 			if (hyp_ppage)
1858 				hyp_ppage->order = ~((u8)0);
1859 		}
1860 
1861 		ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest_sglist);
1862 		/* See __pkvm_host_donate_guest() -EPERM comment */
1863 		if (ret == -EPERM) {
1864 			ret = 0;
1865 			break;
1866 		} else if (ret) {
1867 			break;
1868 		}
1869 
1870 		p = 0;
1871 		list_for_each_entry_safe(ppage, tmp, ppages, list_node) {
1872 			if (p++ >= nr_ppages)
1873 				break;
1874 
1875 			list_del(&ppage->list_node);
1876 			ppage->node.rb_right = ppage->node.rb_left = NULL;
1877 			WARN_ON(insert_ppage(kvm, ppage));
1878 		}
1879 	} while (!list_empty(ppages));
1880 
1881 	return ret;
1882 }
1883 
1884 static int __pkvm_host_donate_guest(struct kvm_vcpu *vcpu, struct list_head *ppages)
1885 {
1886 	struct kvm_pinned_page *ppage, *tmp;
1887 	struct kvm *kvm = vcpu->kvm;
1888 	int ret = -EINVAL; /* Empty list */
1889 
1890 	write_lock(&kvm->mmu_lock);
1891 
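	/* With more than one entry, batch the donation through the sglist path. */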
1892 	if (ppages->next != ppages->prev && kvm->arch.pkvm.enabled) {
1893 		ret = __pkvm_host_donate_guest_sglist(vcpu, ppages);
1894 		goto unlock;
1895 	}
1896 
1897 	list_for_each_entry_safe(ppage, tmp, ppages, list_node) {
1898 		u64 pfn = page_to_pfn(ppage->page);
1899 		gfn_t gfn = ppage->ipa >> PAGE_SHIFT;
1900 
1901 		ret = kvm_call_hyp_nvhe(__pkvm_host_donate_guest, pfn, gfn,
1902 					1 << ppage->order);
1903 		/*
1904 		 * Getting -EPERM at this point implies that the pfn has already been
1905 		 * mapped. This should only ever happen when two vCPUs faulted on the
1906 		 * same page, and the current one lost the race to do the mapping...
1907 		 *
1908 		 * ...or if we've tried to map a region containing an already mapped
1909 		 * entry.
1910 		 */
1911 		if (ret == -EPERM) {
1912 			ret = 0;
1913 			continue;
1914 		} else if (ret) {
1915 			break;
1916 		}
1917 
1918 		list_del(&ppage->list_node);
1919 		ppage->node.rb_right = ppage->node.rb_left = NULL;
1920 		WARN_ON(insert_ppage(kvm, ppage));
1921 
1922 	}
1923 
1924 unlock:
1925 	write_unlock(&kvm->mmu_lock);
1926 
1927 	return ret;
1928 }
1929 
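/*
 * Handle a stage-2 fault for a protected guest: clamp the range to the
 * memslot, pin the backing user pages, build the kvm_pinned_page list
 * (promoting to THP blocks where possible), top up the stage-2 memcache,
 * account the locked memory and donate the pages to the guest at EL2.
 */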
1930 static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size,
1931 			  struct kvm_memory_slot *memslot)
1932 {
1933 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1934 	struct kvm_pinned_page *ppage, *tmp;
1935 	struct mm_struct *mm = current->mm;
1936 	struct kvm *kvm = vcpu->kvm;
1937 	bool account_dec = false;
1938 	struct page **pages;
1939 	LIST_HEAD(ppages);
1940 	long ret, nr_pages;
1941 
1942 	if (WARN_ON(!kvm->arch.pkvm.enabled))
1943 		return -EINVAL;
1944 
1945 	nr_pages = __pkvm_align_memslot(kvm, memslot, gfn, size);
1946 	if (nr_pages < 0)
1947 		return nr_pages;
1948 
1949 	ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages);
1950 	if (ret == -EHWPOISON)
1951 		return 0;
1952 	else if (ret == -EREMOTEIO)
1953 		return __pkvm_mem_abort_device(vcpu, memslot, gfn, nr_pages);
1954 	else if (ret)
1955 		return ret;
1956 
1957 	ret = __pkvm_pages_to_ppages(kvm, memslot, gfn, &nr_pages, pages, &ppages);
1958 	if (ret) {
1959 		unpin_user_pages(pages, nr_pages);
1960 		goto free_pages;
1961 	} else if (list_empty(&ppages)) {
1962 		ret = 0;
1963 		goto free_pages;
1964 	}
1965 
1966 	ret = __pkvm_topup_stage2_memcache(vcpu, &ppages);
1967 	if (ret)
1968 		goto free_ppages;
1969 
1970 	ret = account_locked_vm(mm, nr_pages, true);
1971 	if (ret)
1972 		goto free_ppages;
1973 	account_dec = true;
1974 
1975 	ret = __pkvm_host_donate_guest(vcpu, &ppages);
1976 
1977 free_ppages:
1978 	/* Pages left in the list haven't been mapped */
1979 	list_for_each_entry_safe(ppage, tmp, &ppages, list_node) {
1980 		list_del(&ppage->list_node);
1981 		unpin_user_pages(&ppage->page, 1);
1982 		if (account_dec)
1983 			account_locked_vm(mm, 1 << ppage->order, false);
1984 		kfree(ppage);
1985 	}
1986 
1987 free_pages:
1988 	kfree(pages);
1989 
1990 	return ret;
1991 }
1992 
1993 int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size)
1994 {
1995 	phys_addr_t ipa_end = fault_ipa + size - 1;
1996 	struct kvm_memory_slot *memslot;
1997 	int idx, err = 0;
1998 
1999 	if (!PAGE_ALIGNED(size | fault_ipa))
2000 		return -EINVAL;
2001 
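	/* Reject ranges that end beyond the guest IPA space or that wrap around. */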
2002 	if (ipa_end >= BIT_ULL(get_kvm_ipa_limit()) ||
2003 	    ipa_end >= kvm_phys_size(vcpu->arch.hw_mmu) ||
2004 	    ipa_end <= fault_ipa)
2005 		return -EINVAL;
2006 
2007 	idx = srcu_read_lock(&vcpu->kvm->srcu);
2008 	memslot = gfn_to_memslot(vcpu->kvm, fault_ipa >> PAGE_SHIFT);
2009 	err = pkvm_mem_abort(vcpu, fault_ipa, size, memslot);
2010 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2011 
2012 	return err;
2013 }
2014 
2015 int pkvm_mem_abort_prefault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
2016 			    struct kvm_memory_slot *memslot)
2017 {
2018 	phys_addr_t memslot_start = memslot->base_gfn << PAGE_SHIFT;
2019 	size_t size = (1 << pkvm_prefault) << PAGE_SHIFT;
2020 
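	/*
	 * Prefault a (1 << pkvm_prefault)-page, size-aligned window around the
	 * faulting IPA, clipped to the start of the memslot.
	 */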
2021 	fault_ipa = ALIGN_DOWN(fault_ipa, size);
2022 	if (fault_ipa < memslot_start) {
2023 		size -= memslot_start - fault_ipa;
2024 		fault_ipa = memslot_start;
2025 	}
2026 
2027 	return pkvm_mem_abort(vcpu, fault_ipa, size, memslot);
2028 }
2029 
2030 /*
2031  * In the pKVM case, splitting here is only expected on the back of a guest
2032  * relinquish HVC, whereas pkvm_pgtable_stage2_split() can be called with dirty logging.
2033  */
2034 int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size)
2035 {
2036 	struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc);
2037 	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
2038 	struct kvm_pinned_page *ppage, *tmp;
2039 	struct kvm_memory_slot *memslot;
2040 	struct kvm *kvm = vcpu->kvm;
2041 	int idx, p, ret, nr_pages;
2042 	struct page **pages;
2043 	kvm_pfn_t pfn;
2044 	gfn_t gfn;
2045 
2046 	if (WARN_ON(!kvm->arch.pkvm.enabled))
2047 		return -EINVAL;
2048 
2049 	if (!IS_ALIGNED(ipa, PMD_SIZE) || size != PMD_SIZE)
2050 		return -EINVAL;
2051 
2052 	ret = topup_hyp_memcache_account(vcpu->kvm, hyp_memcache, 1, 0);
2053 	if (ret)
2054 		return ret;
2055 
2056 	/* We already have 1 pin on the Huge Page */
2057 	nr_pages = (size >> PAGE_SHIFT) - 1;
2058 	gfn = (ipa >> PAGE_SHIFT) + 1;
2059 
2060 	/* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */
2061 	for (p = 0; p < nr_pages; p++) {
2062 		ppage = kzalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
2063 		if (!ppage) {
2064 			ret = -ENOMEM;
2065 			goto free_pinned_pages;
2066 		}
2067 		list_add(&ppage->list_node, &ppage_prealloc);
2068 	}
2069 
2070 	idx = srcu_read_lock(&vcpu->kvm->srcu);
2071 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
2072 	ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages);
2073 	if (ret)
2074 		goto unlock_srcu;
2075 
2076 	write_lock(&kvm->mmu_lock);
2077 
2078 	ppage = find_ppage(kvm, ipa);
2079 	if (!ppage) {
2080 		ret = -EPERM;
2081 		goto end;
2082 	} else if (!ppage->order) {
2083 		ret = 0;
2084 		goto end;
2085 	}
2086 
2087 	ret = kvm_call_hyp_nvhe(__pkvm_host_split_guest, ipa >> PAGE_SHIFT, size);
2088 	if (ret)
2089 		goto end;
2090 
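	/*
	 * The block mapping has been split at EL2: downgrade the head ppage to
	 * a single page and create page-sized ppages for the remainder.
	 */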
2091 	ppage->order = 0;
2092 
2093 	pfn = page_to_pfn(ppage->page) + 1;
2094 	ipa = ipa + PAGE_SIZE;
2095 	while (nr_pages--) {
2096 		/* Pop a ppage from the pre-allocated list */
2097 		ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node);
2098 		list_del_init(&ppage->list_node);
2099 
2100 		ppage->page = pfn_to_page(pfn);
2101 		ppage->ipa = ipa;
2102 		ppage->order = 0;
2103 		insert_ppage(kvm, ppage);
2104 
2105 		pfn += 1;
2106 		ipa += PAGE_SIZE;
2107 	}
2108 
2109 end:
2110 	write_unlock(&kvm->mmu_lock);
2111 
2112 	if (ret)
2113 		unpin_user_pages(pages, nr_pages);
2114 	kfree(pages);
2115 
2116 unlock_srcu:
2117 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2118 
2119 free_pinned_pages:
2120 	/* Free unused pre-allocated kvm_pinned_page */
2121 	list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) {
2122 		list_del(&ppage->list_node);
2123 		kfree(ppage);
2124 	}
2125 
2126 	return ret;
2127 }
2128 
2129 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
2130 			  struct kvm_s2_trans *nested,
2131 			  struct kvm_memory_slot *memslot,
2132 			  bool fault_is_perm)
2133 {
2134 	int ret = 0;
2135 	bool write_fault, writable, force_pte = false;
2136 	bool exec_fault, mte_allowed;
2137 	bool device = false, vfio_allow_any_uc = false;
2138 	unsigned long mmu_seq;
2139 	phys_addr_t ipa = fault_ipa;
2140 	struct kvm *kvm = vcpu->kvm;
2141 	struct vm_area_struct *vma;
2142 	short vma_shift;
2143 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
2144 	void *memcache;
2145 	kvm_pfn_t pfn;
2146 	bool logging_active = memslot_is_logging(memslot);
2147 	long vma_pagesize, fault_granule;
2148 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
2149 	struct kvm_pgtable *pgt;
2150 	unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
2151 	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
2152 
2153 	if (fault_is_perm)
2154 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
2155 	write_fault = kvm_is_write_fault(vcpu);
2156 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
2157 	VM_BUG_ON(write_fault && exec_fault);
2158 
2159 	if (fault_is_perm && !write_fault && !exec_fault) {
2160 		kvm_err("Unexpected L2 read permission error\n");
2161 		return -EFAULT;
2162 	}
2163 
2164 	/*
2165 	 * Permission faults just need to update the existing leaf entry,
2166 	 * and so normally don't require allocations from the memcache. The
2167 	 * only exception to this is when dirty logging is enabled at runtime
2168 	 * and a write fault needs to collapse a block entry into a table.
2169 	 */
2170 	if (!fault_is_perm || (logging_active && write_fault)) {
2171 		int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
2172 
2173 		if (!is_protected_kvm_enabled()) {
2174 			memcache = &vcpu->arch.mmu_page_cache;
2175 			ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
2176 		} else {
2177 			memcache = &vcpu->arch.stage2_mc;
2178 			ret = topup_hyp_memcache_account(kvm, memcache, min_pages, 0);
2179 		}
2180 		if (ret)
2181 			return ret;
2182 	}
2183 
2184 	/*
2185 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
2186 	 * get block mapping for device MMIO region.
2187 	 */
2188 	mmap_read_lock(current->mm);
2189 	vma = vma_lookup(current->mm, hva);
2190 	if (unlikely(!vma)) {
2191 		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
2192 		mmap_read_unlock(current->mm);
2193 		return -EFAULT;
2194 	}
2195 
2196 	/*
2197 	 * logging_active is guaranteed to never be true for VM_PFNMAP
2198 	 * memslots.
2199 	 */
2200 	if (logging_active) {
2201 		force_pte = true;
2202 		vma_shift = PAGE_SHIFT;
2203 	} else {
2204 		vma_shift = get_vma_page_shift(vma, hva);
2205 	}
2206 
2207 	switch (vma_shift) {
2208 #ifndef __PAGETABLE_PMD_FOLDED
2209 	case PUD_SHIFT:
2210 		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
2211 			break;
2212 		fallthrough;
2213 #endif
2214 	case CONT_PMD_SHIFT:
2215 		vma_shift = PMD_SHIFT;
2216 		fallthrough;
2217 	case PMD_SHIFT:
2218 		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
2219 			break;
2220 		fallthrough;
2221 	case CONT_PTE_SHIFT:
2222 		vma_shift = PAGE_SHIFT;
2223 		force_pte = true;
2224 		fallthrough;
2225 	case PAGE_SHIFT:
2226 		break;
2227 	default:
2228 		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
2229 	}
2230 
2231 	vma_pagesize = 1UL << vma_shift;
2232 
2233 	if (nested) {
2234 		unsigned long max_map_size;
2235 
2236 		max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
2237 
2238 		ipa = kvm_s2_trans_output(nested);
2239 
2240 		/*
2241 		 * If we're about to create a shadow stage 2 entry, then we
2242 		 * can only create a block mapping if the guest stage 2 page
2243 		 * table uses at least as big a mapping.
2244 		 */
2245 		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
2246 
2247 		/*
2248 		 * Be careful that if the mapping size falls between
2249 		 * two host sizes, take the smallest of the two.
2250 		 */
2251 		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
2252 			max_map_size = PMD_SIZE;
2253 		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
2254 			max_map_size = PAGE_SIZE;
2255 
2256 		force_pte = (max_map_size == PAGE_SIZE);
2257 		vma_pagesize = min(vma_pagesize, (long)max_map_size);
2258 	}
2259 
2260 	/*
2261 	 * Both the canonical IPA and fault IPA must be hugepage-aligned to
2262 	 * ensure we find the right PFN and lay down the mapping in the right
2263 	 * place.
2264 	 */
2265 	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
2266 		fault_ipa &= ~(vma_pagesize - 1);
2267 		ipa &= ~(vma_pagesize - 1);
2268 	}
2269 
2270 	gfn = fault_ipa >> PAGE_SHIFT;
2271 	mte_allowed = kvm_vma_mte_allowed(vma);
2272 
2273 	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
2274 
2275 	/* Don't use the VMA after the unlock -- it may have vanished */
2276 	vma = NULL;
2277 
2278 	/*
2279 	 * Read mmu_invalidate_seq so that KVM can detect if the results of
2280 	 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
2281 	 * acquiring kvm->mmu_lock.
2282 	 *
2283 	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
2284 	 * with the smp_wmb() in kvm_mmu_invalidate_end().
2285 	 */
2286 	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
2287 	mmap_read_unlock(current->mm);
2288 
2289 	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
2290 				   write_fault, &writable, NULL);
2291 	if (pfn == KVM_PFN_ERR_HWPOISON) {
2292 		kvm_send_hwpoison_signal(hva, vma_shift);
2293 		return 0;
2294 	}
2295 	if (is_error_noslot_pfn(pfn))
2296 		return -EFAULT;
2297 
2298 	if (kvm_is_device_pfn(pfn)) {
2299 		/*
2300 		 * If the page was identified as device early by looking at
2301 		 * the VMA flags, vma_pagesize is already representing the
2302 		 * largest quantity we can map.  If instead it was mapped
2303 		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
2304 		 * and must not be upgraded.
2305 		 *
2306 		 * In both cases, we don't let transparent_hugepage_adjust()
2307 		 * change things at the last minute.
2308 		 */
2309 		device = true;
2310 	} else if (logging_active && !write_fault) {
2311 		/*
2312 		 * Only actually map the page as writable if this was a write
2313 		 * fault.
2314 		 */
2315 		writable = false;
2316 	}
2317 
2318 	if (exec_fault && device)
2319 		return -ENOEXEC;
2320 
2321 	/*
2322 	 * Potentially reduce shadow S2 permissions to match the guest's own
2323 	 * S2. For exec faults, we'd only reach this point if the guest
2324 	 * actually allowed it (see kvm_s2_handle_perm_fault).
2325 	 *
2326 	 * Also encode the level of the original translation in the SW bits
2327 	 * of the leaf entry as a proxy for the span of that translation.
2328 	 * This will be retrieved on TLB invalidation from the guest and
2329 	 * used to limit the invalidation scope if a TTL hint or a range
2330 	 * isn't provided.
2331 	 */
2332 	if (nested) {
2333 		writable &= kvm_s2_trans_writable(nested);
2334 		if (!kvm_s2_trans_readable(nested))
2335 			prot &= ~KVM_PGTABLE_PROT_R;
2336 
2337 		prot |= kvm_encode_nested_level(nested);
2338 	}
2339 
2340 	kvm_fault_lock(kvm);
2341 	pgt = vcpu->arch.hw_mmu->pgt;
2342 	if (mmu_invalidate_retry(kvm, mmu_seq)) {
2343 		ret = -EAGAIN;
2344 		goto out_unlock;
2345 	}
2346 
2347 	/*
2348 	 * If we are not forced to use page mapping, check if we are
2349 	 * backed by a THP and thus use block mapping if possible.
2350 	 */
2351 	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
2352 		if (fault_is_perm && fault_granule > PAGE_SIZE)
2353 			vma_pagesize = fault_granule;
2354 		else
2355 			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
2356 								   hva, &pfn,
2357 								   &fault_ipa);
2358 
2359 		if (vma_pagesize < 0) {
2360 			ret = vma_pagesize;
2361 			goto out_unlock;
2362 		}
2363 	}
2364 
2365 	if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
2366 		/* Check the VMM hasn't introduced a new disallowed VMA */
2367 		if (mte_allowed) {
2368 			sanitise_mte_tags(kvm, pfn, vma_pagesize);
2369 		} else {
2370 			ret = -EFAULT;
2371 			goto out_unlock;
2372 		}
2373 	}
2374 
2375 	if (writable)
2376 		prot |= KVM_PGTABLE_PROT_W;
2377 
2378 	if (exec_fault)
2379 		prot |= KVM_PGTABLE_PROT_X;
2380 
2381 	if (device) {
2382 		if (vfio_allow_any_uc)
2383 			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
2384 		else
2385 			prot |= KVM_PGTABLE_PROT_DEVICE;
2386 	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
2387 		   (!nested || kvm_s2_trans_executable(nested))) {
2388 		prot |= KVM_PGTABLE_PROT_X;
2389 	}
2390 
2391 	/*
2392 	 * Under the premise of getting a FSC_PERM fault, we just need to relax
2393 	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
2394 	 * kvm_pgtable_stage2_map() should be called to change block size.
2395 	 */
2396 	if (fault_is_perm && vma_pagesize == fault_granule) {
2397 		/*
2398 		 * Drop the SW bits in favour of those stored in the
2399 		 * PTE, which will be preserved.
2400 		 */
2401 		prot &= ~KVM_NV_GUEST_MAP_SZ;
2402 		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
2403 	} else {
2404 		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
2405 					     __pfn_to_phys(pfn), prot,
2406 					     memcache, flags);
2407 	}
2408 
2409 out_unlock:
2410 	kvm_fault_unlock(kvm);
2411 
2412 	/* Mark the page dirty only if the fault is handled successfully */
2413 	if (writable && !ret) {
2414 		kvm_set_pfn_dirty(pfn);
2415 		mark_page_dirty_in_slot(kvm, memslot, gfn);
2416 	}
2417 
2418 	kvm_release_pfn_clean(pfn);
2419 	return ret != -EAGAIN ? ret : 0;
2420 }
2421 
2422 /* Resolve the access fault by making the page young again. */
2423 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
2424 {
2425 	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
2426 	kvm_pte_t pte;
2427 	struct kvm_s2_mmu *mmu;
2428 
2429 	trace_kvm_access_fault(fault_ipa);
2430 
2431 	read_lock(&vcpu->kvm->mmu_lock);
2432 	mmu = vcpu->arch.hw_mmu;
2433 	pte = KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
2434 	read_unlock(&vcpu->kvm->mmu_lock);
2435 
2436 	if (kvm_pte_valid(pte))
2437 		kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
2438 }
2439 
2440 /**
2441  * kvm_handle_guest_abort - handles all 2nd stage aborts
2442  * @vcpu:	the VCPU pointer
2443  *
2444  * Any abort that gets to the host is almost guaranteed to be caused by a
2445  * missing second stage translation table entry, which can mean either that the
2446  * guest simply needs more memory (and we must allocate an appropriate page) or
2447  * that the guest tried to access I/O memory, which is emulated by user space.
2448  * The distinction is based on the IPA causing the fault and whether this
2449  * memory region has been registered as standard RAM by user space.
2450  */
2451 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
2452 {
2453 	struct kvm_s2_trans nested_trans, *nested = NULL;
2454 	unsigned long esr;
2455 	phys_addr_t fault_ipa; /* The address we faulted on */
2456 	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
2457 	struct kvm_memory_slot *memslot;
2458 	bool is_iabt, write_fault, writable;
2459 	gfn_t gfn;
2460 	int ret, idx;
2461 
2462 	esr = kvm_vcpu_get_esr(vcpu);
2463 
2464 	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
2465 	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
2466 
2467 	if (esr_fsc_is_translation_fault(esr)) {
2468 		/* Beyond sanitised PARange (which is the IPA limit) */
2469 		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
2470 			kvm_inject_size_fault(vcpu);
2471 			return 1;
2472 		}
2473 
2474 		/* Falls between the IPA range and the PARange? */
2475 		if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
2476 			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
2477 
2478 			if (is_iabt)
2479 				kvm_inject_pabt(vcpu, fault_ipa);
2480 			else
2481 				kvm_inject_dabt(vcpu, fault_ipa);
2482 			return 1;
2483 		}
2484 	}
2485 
2486 	/* Synchronous External Abort? */
2487 	if (kvm_vcpu_abt_issea(vcpu)) {
2488 		/*
2489 		 * For RAS the host kernel may handle this abort.
2490 		 * There is no need to pass the error into the guest.
2491 		 */
2492 		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
2493 			kvm_inject_vabt(vcpu);
2494 
2495 		return 1;
2496 	}
2497 
2498 	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
2499 			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
2500 
2501 	/* Check that the stage-2 fault is a translation, permission or access-flag fault */
2502 	if (!esr_fsc_is_translation_fault(esr) &&
2503 	    !esr_fsc_is_permission_fault(esr) &&
2504 	    !esr_fsc_is_access_flag_fault(esr)) {
2505 		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
2506 			kvm_vcpu_trap_get_class(vcpu),
2507 			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
2508 			(unsigned long)kvm_vcpu_get_esr(vcpu));
2509 		return -EFAULT;
2510 	}
2511 
2512 	idx = srcu_read_lock(&vcpu->kvm->srcu);
2513 
2514 	/*
2515 	 * We may have faulted on a shadow stage 2 page table if we are
2516 	 * running a nested guest.  In this case, we have to resolve the L2
2517 	 * IPA to the L1 IPA first, before knowing what kind of memory should
2518 	 * back the L1 IPA.
2519 	 *
2520 	 * If the shadow stage 2 page table walk faults, then we simply inject
2521 	 * this to the guest and carry on.
2522 	 *
2523 	 * If there are no shadow S2 PTs because S2 is disabled, there is
2524 	 * nothing to walk and we treat it as a 1:1 before going through the
2525 	 * canonical translation.
2526 	 */
2527 	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
2528 	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
2529 		u32 esr;
2530 
2531 		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
2532 		if (ret) {
2533 			esr = kvm_s2_trans_esr(&nested_trans);
2534 			kvm_inject_s2_fault(vcpu, esr);
2535 			goto out_unlock;
2536 		}
2537 
2538 		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
2539 		if (ret) {
2540 			esr = kvm_s2_trans_esr(&nested_trans);
2541 			kvm_inject_s2_fault(vcpu, esr);
2542 			goto out_unlock;
2543 		}
2544 
2545 		ipa = kvm_s2_trans_output(&nested_trans);
2546 		nested = &nested_trans;
2547 	}
2548 
2549 	gfn = ipa >> PAGE_SHIFT;
2550 	memslot = gfn_to_memslot_prot(vcpu->kvm, gfn, &writable);
2551 	write_fault = kvm_is_write_fault(vcpu);
2552 	if (!memslot || (write_fault && !writable)) {
2553 		/*
2554 		 * The guest has put either its instructions or its page-tables
2555 		 * somewhere it shouldn't have. Userspace won't be able to do
2556 		 * anything about this (there's no syndrome for a start), so
2557 		 * re-inject the abort back into the guest.
2558 		 */
2559 		if (is_iabt) {
2560 			ret = -ENOEXEC;
2561 			goto out;
2562 		}
2563 
2564 		if (kvm_vcpu_abt_iss1tw(vcpu)) {
2565 			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2566 			ret = 1;
2567 			goto out_unlock;
2568 		}
2569 
2570 		/*
2571 		 * Check for a cache maintenance operation. Since we
2572 		 * ended-up here, we know it is outside of any memory
2573 		 * slot. But we can't find out if that is for a device,
2574 		 * or if the guest is just being stupid. The only thing
2575 		 * we know for sure is that this range cannot be cached.
2576 		 *
2577 		 * So let's assume that the guest is just being
2578 		 * cautious, and skip the instruction.
2579 		 */
2580 		if (!memslot && kvm_vcpu_dabt_is_cm(vcpu)) {
2581 			kvm_incr_pc(vcpu);
2582 			ret = 1;
2583 			goto out_unlock;
2584 		}
2585 
2586 		/*
2587 		 * The IPA is reported as [MAX:12], so we need to
2588 		 * complement it with the bottom 12 bits from the
2589 		 * faulting VA. This is always 12 bits, irrespective
2590 		 * of the page size.
2591 		 */
2592 		ipa |= kvm_vcpu_get_hfar(vcpu) & FAR_MASK;
2593 		ret = io_mem_abort(vcpu, ipa);
2594 		goto out_unlock;
2595 	}
2596 
2597 	/* Userspace should not be able to register out-of-bounds IPAs */
2598 	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
2599 
2600 	if (esr_fsc_is_access_flag_fault(esr)) {
2601 		handle_access_fault(vcpu, fault_ipa);
2602 		ret = 1;
2603 		goto out_unlock;
2604 	}
2605 
2606 	if (kvm_vm_is_protected(vcpu->kvm))
2607 		ret = pkvm_mem_abort_prefault(vcpu, fault_ipa, memslot);
2608 	else
2609 		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot,
2610 				     esr_fsc_is_permission_fault(esr));
2611 
2612 	if (ret == 0)
2613 		ret = 1;
2614 out:
2615 	if (ret == -ENOEXEC) {
2616 		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2617 		ret = 1;
2618 	}
2619 out_unlock:
2620 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2621 	return ret;
2622 }
2623 
2624 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
2625 {
2626 	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
2627 			     (range->end - range->start) << PAGE_SHIFT,
2628 			     range->may_block);
2629 
2630 	kvm_nested_s2_unmap(kvm, range->may_block);
2631 	return false;
2632 }
2633 
2634 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
2635 {
2636 	u64 size = (range->end - range->start) << PAGE_SHIFT;
2637 
2638 	/*
2639 	 * TODO: Handle nested_mmu structures here using the reverse mapping in
2640 	 * a later version of the patch series.
2641 	 */
2642 	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
2643 						   range->start << PAGE_SHIFT,
2644 						   size, true);
2645 }
2646 
2647 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
2648 {
2649 	u64 size = (range->end - range->start) << PAGE_SHIFT;
2650 
2651 	return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
2652 						   range->start << PAGE_SHIFT,
2653 						   size, false);
2654 }
2655 
2656 phys_addr_t kvm_mmu_get_httbr(void)
2657 {
2658 	return __pa(hyp_pgtable->pgd);
2659 }
2660 
2661 phys_addr_t kvm_get_idmap_vector(void)
2662 {
2663 	return hyp_idmap_vector;
2664 }
2665 
2666 static int kvm_map_idmap_text(void)
2667 {
2668 	unsigned long size = hyp_idmap_end - hyp_idmap_start;
2669 	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
2670 					PAGE_HYP_EXEC);
2671 	if (err)
2672 		kvm_err("Failed to idmap %lx-%lx\n",
2673 			hyp_idmap_start, hyp_idmap_end);
2674 
2675 	return err;
2676 }
2677 
2678 static void *kvm_hyp_zalloc_page(void *arg)
2679 {
2680 	return (void *)get_zeroed_page(GFP_KERNEL);
2681 }
2682 
2683 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
2684 	.zalloc_page		= kvm_hyp_zalloc_page,
2685 	.get_page		= kvm_host_get_page,
2686 	.put_page		= kvm_host_put_page,
2687 	.phys_to_virt		= kvm_host_va,
2688 	.virt_to_phys		= kvm_host_pa,
2689 };
2690 
2691 int __init kvm_mmu_init(void)
2692 {
2693 	int err;
2694 	u32 idmap_bits;
2695 	u32 kernel_bits;
2696 
2697 	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2698 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2699 	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2700 	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2701 	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2702 
2703 	/*
2704 	 * We rely on the linker script to ensure at build time that the HYP
2705 	 * init code does not cross a page boundary.
2706 	 */
2707 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
2708 
2709 	/*
2710 	 * The ID map is always configured for 48 bits of translation, which
2711 	 * may be fewer than the number of VA bits used by the regular kernel
2712 	 * stage 1, when VA_BITS=52.
2713 	 *
2714 	 * At EL2, there is only one TTBR register, and we can't switch between
2715 	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
2716 	 * line: we need to use the extended range with *both* our translation
2717 	 * tables.
2718 	 *
2719 	 * So use the maximum of the idmap VA bits and the regular kernel stage
2720 	 * 1 VA bits to assure that the hypervisor can both ID map its code page
2721 	 * and map any kernel memory.
2722 	 */
2723 	idmap_bits = IDMAP_VA_BITS;
2724 	kernel_bits = vabits_actual;
2725 	hyp_va_bits = max(idmap_bits, kernel_bits);
2726 
2727 	kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
2728 	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2729 	kvm_debug("HYP VA range: %lx:%lx\n",
2730 		  kern_hyp_va(PAGE_OFFSET),
2731 		  kern_hyp_va((unsigned long)high_memory - 1));
2732 
2733 	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2734 	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2735 	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2736 		/*
2737 		 * The idmap page is intersecting with the VA space,
2738 		 * it is not safe to continue further.
2739 		 */
2740 		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2741 		err = -EINVAL;
2742 		goto out;
2743 	}
2744 
2745 	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2746 	if (!hyp_pgtable) {
2747 		kvm_err("Hyp mode page-table not allocated\n");
2748 		err = -ENOMEM;
2749 		goto out;
2750 	}
2751 
2752 	err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits, &kvm_hyp_mm_ops);
2753 	if (err)
2754 		goto out_free_pgtable;
2755 
2756 	err = kvm_map_idmap_text();
2757 	if (err)
2758 		goto out_destroy_pgtable;
2759 
2760 	io_map_base = hyp_idmap_start;
2761 	return 0;
2762 
2763 out_destroy_pgtable:
2764 	kvm_pgtable_hyp_destroy(hyp_pgtable);
2765 out_free_pgtable:
2766 	kfree(hyp_pgtable);
2767 	hyp_pgtable = NULL;
2768 out:
2769 	return err;
2770 }
2771 
2772 void kvm_arch_commit_memory_region(struct kvm *kvm,
2773 				   struct kvm_memory_slot *old,
2774 				   const struct kvm_memory_slot *new,
2775 				   enum kvm_mr_change change)
2776 {
2777 	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
2778 
2779 	/*
2780 	 * At this point memslot has been committed and there is an
2781 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
2782 	 * memory slot is write protected.
2783 	 */
2784 	if (log_dirty_pages) {
2785 
2786 		if (change == KVM_MR_DELETE)
2787 			return;
2788 
2789 		/*
2790 		 * Huge and normal pages are write-protected and split
2791 		 * on either of these two cases:
2792 		 *
2793 		 * 1. with initial-all-set: gradually with CLEAR ioctls,
2794 		 */
2795 		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
2796 			return;
2797 		/*
2798 		 * or
2799 		 * 2. without initial-all-set: all in one shot when
2800 		 *    enabling dirty logging.
2801 		 */
2802 		kvm_mmu_wp_memory_region(kvm, new->id);
2803 		kvm_mmu_split_memory_region(kvm, new->id);
2804 	} else {
2805 		/*
2806 		 * Free any leftovers from the eager page splitting cache. Do
2807 		 * this when deleting, moving, disabling dirty logging, or
2808 		 * creating the memslot (a nop). Doing it for deletes makes
2809 		 * sure we don't leak memory, and there's no need to keep the
2810 		 * cache around for any of the other cases.
2811 		 */
2812 		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
2813 	}
2814 }
2815 
2816 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2817 				   const struct kvm_memory_slot *old,
2818 				   struct kvm_memory_slot *new,
2819 				   enum kvm_mr_change change)
2820 {
2821 	hva_t hva, reg_end;
2822 	int ret = 0;
2823 
2824 	if (is_protected_kvm_enabled()) {
2825 		if ((change == KVM_MR_DELETE || change == KVM_MR_MOVE) &&
2826 		    pkvm_is_hyp_created(kvm) && kvm->arch.pkvm.enabled) {
2827 			return -EPERM;
2828 		}
2829 
2830 		if (new && kvm->arch.pkvm.enabled &&
2831 		    new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) {
2832 			return -EPERM;
2833 		}
2834 	}
2835 
2836 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2837 			change != KVM_MR_FLAGS_ONLY)
2838 		return 0;
2839 
2840 	/*
2841 	 * Prevent userspace from creating a memory region outside of the IPA
2842 	 * space addressable by the KVM guest IPA space.
2843 	 */
2844 	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
2845 		return -EFAULT;
2846 
2847 	hva = new->userspace_addr;
2848 	reg_end = hva + (new->npages << PAGE_SHIFT);
2849 
2850 	mmap_read_lock(current->mm);
2851 	/*
2852 	 * A memory region could potentially cover multiple VMAs, and any holes
2853 	 * between them, so iterate over all of them.
2854 	 *
2855 	 *     +--------------------------------------------+
2856 	 * +---------------+----------------+   +----------------+
2857 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2858 	 * +---------------+----------------+   +----------------+
2859 	 *     |               memory region                |
2860 	 *     +--------------------------------------------+
2861 	 */
2862 	do {
2863 		struct vm_area_struct *vma;
2864 
2865 		vma = find_vma_intersection(current->mm, hva, reg_end);
2866 		if (!vma)
2867 			break;
2868 
2869 		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
2870 			ret = -EINVAL;
2871 			break;
2872 		}
2873 
2874 		if (vma->vm_flags & VM_PFNMAP) {
2875 			/* IO region dirty page logging not allowed */
2876 			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2877 				ret = -EINVAL;
2878 				break;
2879 			}
2880 		}
2881 		hva = min(reg_end, vma->vm_end);
2882 	} while (hva < reg_end);
2883 
2884 	mmap_read_unlock(current->mm);
2885 	return ret;
2886 }
2887 
2888 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2889 {
2890 }
2891 
2892 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2893 {
2894 }
2895 
2896 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2897 				   struct kvm_memory_slot *slot)
2898 {
2899 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2900 	phys_addr_t size = slot->npages << PAGE_SHIFT;
2901 
2902 	write_lock(&kvm->mmu_lock);
2903 	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
2904 	kvm_nested_s2_unmap(kvm, true);
2905 	write_unlock(&kvm->mmu_lock);
2906 }
2907 
2908 /*
2909  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2910  *
2911  * Main problems:
2912  * - S/W ops are local to a CPU (not broadcast)
2913  * - We have line migration behind our back (speculation)
2914  * - System caches don't support S/W at all (damn!)
2915  *
2916  * In the face of the above, the best we can do is to try and convert
2917  * S/W ops to VA ops. Because the guest is not allowed to infer the
2918  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2919  * which is a rather good thing for us.
2920  *
2921  * Also, it is only used when turning caches on/off ("The expected
2922  * usage of the cache maintenance instructions that operate by set/way
2923  * is associated with the cache maintenance instructions associated
2924  * with the powerdown and powerup of caches, if this is required by
2925  * the implementation.").
2926  *
2927  * We use the following policy:
2928  *
2929  * - If we trap a S/W operation, we enable VM trapping to detect
2930  *   caches being turned on/off, and do a full clean.
2931  *
2932  * - We flush the caches on both caches being turned on and off.
2933  *
2934  * - Once the caches are enabled, we stop trapping VM ops.
2935  */
2936 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2937 {
2938 	unsigned long hcr = *vcpu_hcr(vcpu);
2939 
2940 	/*
2941 	 * If this is the first time we do a S/W operation
2942 	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
2943 	 * VM trapping.
2944 	 *
2945 	 * Otherwise, rely on the VM trapping to wait for the MMU +
2946 	 * Caches to be turned off. At that point, we'll be able to
2947 	 * clean the caches again.
2948 	 */
2949 	if (!(hcr & HCR_TVM)) {
2950 		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2951 					vcpu_has_cache_enabled(vcpu));
2952 		stage2_flush_vm(vcpu->kvm);
2953 		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2954 	}
2955 }
2956 
2957 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2958 {
2959 	bool now_enabled = vcpu_has_cache_enabled(vcpu);
2960 
2961 	/*
2962 	 * If switching the MMU+caches on, need to invalidate the caches.
2963 	 * If switching it off, need to clean the caches.
2964 	 * Clean + invalidate does the trick always.
2965 	 */
2966 	if (now_enabled != was_enabled)
2967 		stage2_flush_vm(vcpu->kvm);
2968 
2969 	/* Caches are now on, stop trapping VM ops (until a S/W op) */
2970 	if (now_enabled)
2971 		*vcpu_hcr(vcpu) &= ~HCR_TVM;
2972 
2973 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2974 }
2975