1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6 
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
9 #include <linux/io.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_mmio.h>
18 #include <asm/kvm_ras.h>
19 #include <asm/kvm_asm.h>
20 #include <asm/kvm_emulate.h>
21 #include <asm/virt.h>
22 
23 #include "trace.h"
24 
25 static pgd_t *boot_hyp_pgd;
26 static pgd_t *hyp_pgd;
27 static pgd_t *merged_hyp_pgd;
28 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
29 
30 static unsigned long hyp_idmap_start;
31 static unsigned long hyp_idmap_end;
32 static phys_addr_t hyp_idmap_vector;
33 
34 static unsigned long io_map_base;
35 
36 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
37 
38 #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
39 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
40 
41 static bool is_iomap(unsigned long flags)
42 {
43 	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
44 }
45 
46 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
47 {
48 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
49 }
50 
51 /**
52  * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
53  * @kvm:	pointer to kvm structure.
54  *
55  * Interface to HYP function to flush all VM TLB entries
56  */
57 void kvm_flush_remote_tlbs(struct kvm *kvm)
58 {
59 	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
60 }
61 
62 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
63 {
64 	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
65 }
66 
67 /*
68  * D-Cache management functions. They take the page table entries by
69  * value, as they are flushing the cache using the kernel mapping (or
70  * kmap on 32bit).
71  */
72 static void kvm_flush_dcache_pte(pte_t pte)
73 {
74 	__kvm_flush_dcache_pte(pte);
75 }
76 
77 static void kvm_flush_dcache_pmd(pmd_t pmd)
78 {
79 	__kvm_flush_dcache_pmd(pmd);
80 }
81 
82 static void kvm_flush_dcache_pud(pud_t pud)
83 {
84 	__kvm_flush_dcache_pud(pud);
85 }
86 
87 static bool kvm_is_device_pfn(unsigned long pfn)
88 {
89 	return !pfn_valid(pfn);
90 }
91 
92 /**
93  * stage2_dissolve_pmd() - clear and flush huge PMD entry
94  * @kvm:	pointer to kvm structure.
95  * @addr:	IPA
96  * @pmd:	pmd pointer for IPA
97  *
98  * Function clears a PMD entry and flushes the 1st and 2nd stage TLBs for @addr.
99  */
100 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
101 {
102 	if (!pmd_thp_or_huge(*pmd))
103 		return;
104 
105 	pmd_clear(pmd);
106 	kvm_tlb_flush_vmid_ipa(kvm, addr);
107 	put_page(virt_to_page(pmd));
108 }
109 
110 /**
111  * stage2_dissolve_pud() - clear and flush huge PUD entry
112  * @kvm:	pointer to kvm structure.
113  * @addr:	IPA
114  * @pud:	pud pointer for IPA
115  *
116  * Function clears a PUD entry and flushes the 1st and 2nd stage TLBs for @addr.
117  */
118 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
119 {
120 	if (!stage2_pud_huge(kvm, *pudp))
121 		return;
122 
123 	stage2_pud_clear(kvm, pudp);
124 	kvm_tlb_flush_vmid_ipa(kvm, addr);
125 	put_page(virt_to_page(pudp));
126 }
127 
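/*
 * Pre-fill a per-vcpu cache of page table pages. Callers top the cache
 * up before taking the (non-sleeping) mmu_lock, so that missing table
 * levels can later be installed without allocating under the spinlock.
 */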
128 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
129 				  int min, int max)
130 {
131 	void *page;
132 
133 	BUG_ON(max > KVM_NR_MEM_OBJS);
134 	if (cache->nobjs >= min)
135 		return 0;
136 	while (cache->nobjs < max) {
137 		page = (void *)__get_free_page(GFP_PGTABLE_USER);
138 		if (!page)
139 			return -ENOMEM;
140 		cache->objects[cache->nobjs++] = page;
141 	}
142 	return 0;
143 }
144 
145 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
146 {
147 	while (mc->nobjs)
148 		free_page((unsigned long)mc->objects[--mc->nobjs]);
149 }
150 
151 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
152 {
153 	void *p;
154 
155 	BUG_ON(!mc || !mc->nobjs);
156 	p = mc->objects[--mc->nobjs];
157 	return p;
158 }
159 
160 static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
161 {
162 	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
163 	stage2_pgd_clear(kvm, pgd);
164 	kvm_tlb_flush_vmid_ipa(kvm, addr);
165 	stage2_pud_free(kvm, pud_table);
166 	put_page(virt_to_page(pgd));
167 }
168 
169 static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
170 {
171 	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
172 	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
173 	stage2_pud_clear(kvm, pud);
174 	kvm_tlb_flush_vmid_ipa(kvm, addr);
175 	stage2_pmd_free(kvm, pmd_table);
176 	put_page(virt_to_page(pud));
177 }
178 
179 static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
180 {
181 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
182 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
183 	pmd_clear(pmd);
184 	kvm_tlb_flush_vmid_ipa(kvm, addr);
185 	free_page((unsigned long)pte_table);
186 	put_page(virt_to_page(pmd));
187 }
188 
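/*
 * Page table updates below are published with WRITE_ONCE() followed by
 * dsb(ishst), so the new entry is visible to the table walker before
 * any subsequent TLB maintenance is issued.
 */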
189 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
190 {
191 	WRITE_ONCE(*ptep, new_pte);
192 	dsb(ishst);
193 }
194 
195 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
196 {
197 	WRITE_ONCE(*pmdp, new_pmd);
198 	dsb(ishst);
199 }
200 
201 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
202 {
203 	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
204 }
205 
206 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
207 {
208 	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
209 	dsb(ishst);
210 }
211 
212 static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
213 {
214 	WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
215 	dsb(ishst);
216 }
217 
218 /*
219  * Unmapping vs dcache management:
220  *
221  * If a guest maps certain memory pages as uncached, all writes will
222  * bypass the data cache and go directly to RAM.  However, the CPUs
223  * can still speculate reads (not writes) and fill cache lines with
224  * data.
225  *
226  * Those cache lines will be *clean* cache lines though, so a
227  * clean+invalidate operation is equivalent to an invalidate
228  * operation, because no cache lines are marked dirty.
229  *
230  * Those clean cache lines could be filled prior to an uncached write
231  * by the guest, and the cache coherent IO subsystem would therefore
232  * end up writing old data to disk.
233  *
234  * This is why right after unmapping a page/section and invalidating
235  * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
236  * the IO subsystem will never hit in the cache.
237  *
238  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
239  * we then fully enforce cacheability of RAM, no matter what the guest
240  * does.
241  */
242 static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
243 		       phys_addr_t addr, phys_addr_t end)
244 {
245 	phys_addr_t start_addr = addr;
246 	pte_t *pte, *start_pte;
247 
248 	start_pte = pte = pte_offset_kernel(pmd, addr);
249 	do {
250 		if (!pte_none(*pte)) {
251 			pte_t old_pte = *pte;
252 
253 			kvm_set_pte(pte, __pte(0));
254 			kvm_tlb_flush_vmid_ipa(kvm, addr);
255 
256 			/* No need to invalidate the cache for device mappings */
257 			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
258 				kvm_flush_dcache_pte(old_pte);
259 
260 			put_page(virt_to_page(pte));
261 		}
262 	} while (pte++, addr += PAGE_SIZE, addr != end);
263 
264 	if (stage2_pte_table_empty(kvm, start_pte))
265 		clear_stage2_pmd_entry(kvm, pmd, start_addr);
266 }
267 
268 static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
269 		       phys_addr_t addr, phys_addr_t end)
270 {
271 	phys_addr_t next, start_addr = addr;
272 	pmd_t *pmd, *start_pmd;
273 
274 	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
275 	do {
276 		next = stage2_pmd_addr_end(kvm, addr, end);
277 		if (!pmd_none(*pmd)) {
278 			if (pmd_thp_or_huge(*pmd)) {
279 				pmd_t old_pmd = *pmd;
280 
281 				pmd_clear(pmd);
282 				kvm_tlb_flush_vmid_ipa(kvm, addr);
283 
284 				kvm_flush_dcache_pmd(old_pmd);
285 
286 				put_page(virt_to_page(pmd));
287 			} else {
288 				unmap_stage2_ptes(kvm, pmd, addr, next);
289 			}
290 		}
291 	} while (pmd++, addr = next, addr != end);
292 
293 	if (stage2_pmd_table_empty(kvm, start_pmd))
294 		clear_stage2_pud_entry(kvm, pud, start_addr);
295 }
296 
297 static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
298 		       phys_addr_t addr, phys_addr_t end)
299 {
300 	phys_addr_t next, start_addr = addr;
301 	pud_t *pud, *start_pud;
302 
303 	start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
304 	do {
305 		next = stage2_pud_addr_end(kvm, addr, end);
306 		if (!stage2_pud_none(kvm, *pud)) {
307 			if (stage2_pud_huge(kvm, *pud)) {
308 				pud_t old_pud = *pud;
309 
310 				stage2_pud_clear(kvm, pud);
311 				kvm_tlb_flush_vmid_ipa(kvm, addr);
312 				kvm_flush_dcache_pud(old_pud);
313 				put_page(virt_to_page(pud));
314 			} else {
315 				unmap_stage2_pmds(kvm, pud, addr, next);
316 			}
317 		}
318 	} while (pud++, addr = next, addr != end);
319 
320 	if (stage2_pud_table_empty(kvm, start_pud))
321 		clear_stage2_pgd_entry(kvm, pgd, start_addr);
322 }
323 
324 /**
325  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
326  * @kvm:   The VM pointer
327  * @start: The intermediate physical base address of the range to unmap
328  * @size:  The size of the area to unmap
329  *
330  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
331  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
332  * destroying the VM), otherwise another faulting VCPU may come in and mess
333  * with things behind our backs.
334  */
335 static void __unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size,
336 				 bool may_block)
337 {
338 	pgd_t *pgd;
339 	phys_addr_t addr = start, end = start + size;
340 	phys_addr_t next;
341 
342 	assert_spin_locked(&kvm->mmu_lock);
343 	WARN_ON(size & ~PAGE_MASK);
344 
345 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
346 	do {
347 		/*
348 		 * Make sure the page table is still active, as another thread
349 		 * could have possibly freed the page table, while we released
350 		 * the lock.
351 		 */
352 		if (!READ_ONCE(kvm->arch.pgd))
353 			break;
354 		next = stage2_pgd_addr_end(kvm, addr, end);
355 		if (!stage2_pgd_none(kvm, *pgd))
356 			unmap_stage2_puds(kvm, pgd, addr, next);
357 		/*
358 		 * If the range is too large, release the kvm->mmu_lock
359 		 * to prevent starvation and lockup detector warnings.
360 		 */
361 		if (may_block && next != end)
362 			cond_resched_lock(&kvm->mmu_lock);
363 	} while (pgd++, addr = next, addr != end);
364 }
365 
366 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
367 {
368 	__unmap_stage2_range(kvm, start, size, true);
369 }
370 
371 static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
372 			      phys_addr_t addr, phys_addr_t end)
373 {
374 	pte_t *pte;
375 
376 	pte = pte_offset_kernel(pmd, addr);
377 	do {
378 		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
379 			kvm_flush_dcache_pte(*pte);
380 	} while (pte++, addr += PAGE_SIZE, addr != end);
381 }
382 
383 static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
384 			      phys_addr_t addr, phys_addr_t end)
385 {
386 	pmd_t *pmd;
387 	phys_addr_t next;
388 
389 	pmd = stage2_pmd_offset(kvm, pud, addr);
390 	do {
391 		next = stage2_pmd_addr_end(kvm, addr, end);
392 		if (!pmd_none(*pmd)) {
393 			if (pmd_thp_or_huge(*pmd))
394 				kvm_flush_dcache_pmd(*pmd);
395 			else
396 				stage2_flush_ptes(kvm, pmd, addr, next);
397 		}
398 	} while (pmd++, addr = next, addr != end);
399 }
400 
401 static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
402 			      phys_addr_t addr, phys_addr_t end)
403 {
404 	pud_t *pud;
405 	phys_addr_t next;
406 
407 	pud = stage2_pud_offset(kvm, pgd, addr);
408 	do {
409 		next = stage2_pud_addr_end(kvm, addr, end);
410 		if (!stage2_pud_none(kvm, *pud)) {
411 			if (stage2_pud_huge(kvm, *pud))
412 				kvm_flush_dcache_pud(*pud);
413 			else
414 				stage2_flush_pmds(kvm, pud, addr, next);
415 		}
416 	} while (pud++, addr = next, addr != end);
417 }
418 
419 static void stage2_flush_memslot(struct kvm *kvm,
420 				 struct kvm_memory_slot *memslot)
421 {
422 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
423 	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
424 	phys_addr_t next;
425 	pgd_t *pgd;
426 
427 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
428 	do {
429 		next = stage2_pgd_addr_end(kvm, addr, end);
430 		if (!stage2_pgd_none(kvm, *pgd))
431 			stage2_flush_puds(kvm, pgd, addr, next);
432 	} while (pgd++, addr = next, addr != end);
433 }
434 
435 /**
436  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
437  * @kvm: The struct kvm pointer
438  *
439  * Go through the stage 2 page tables and invalidate any cache lines
440  * backing memory already mapped to the VM.
441  */
442 static void stage2_flush_vm(struct kvm *kvm)
443 {
444 	struct kvm_memslots *slots;
445 	struct kvm_memory_slot *memslot;
446 	int idx;
447 
448 	idx = srcu_read_lock(&kvm->srcu);
449 	spin_lock(&kvm->mmu_lock);
450 
451 	slots = kvm_memslots(kvm);
452 	kvm_for_each_memslot(memslot, slots)
453 		stage2_flush_memslot(kvm, memslot);
454 
455 	spin_unlock(&kvm->mmu_lock);
456 	srcu_read_unlock(&kvm->srcu, idx);
457 }
458 
459 static void clear_hyp_pgd_entry(pgd_t *pgd)
460 {
461 	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
462 	pgd_clear(pgd);
463 	pud_free(NULL, pud_table);
464 	put_page(virt_to_page(pgd));
465 }
466 
467 static void clear_hyp_pud_entry(pud_t *pud)
468 {
469 	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
470 	VM_BUG_ON(pud_huge(*pud));
471 	pud_clear(pud);
472 	pmd_free(NULL, pmd_table);
473 	put_page(virt_to_page(pud));
474 }
475 
476 static void clear_hyp_pmd_entry(pmd_t *pmd)
477 {
478 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
479 	VM_BUG_ON(pmd_thp_or_huge(*pmd));
480 	pmd_clear(pmd);
481 	pte_free_kernel(NULL, pte_table);
482 	put_page(virt_to_page(pmd));
483 }
484 
485 static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
486 {
487 	pte_t *pte, *start_pte;
488 
489 	start_pte = pte = pte_offset_kernel(pmd, addr);
490 	do {
491 		if (!pte_none(*pte)) {
492 			kvm_set_pte(pte, __pte(0));
493 			put_page(virt_to_page(pte));
494 		}
495 	} while (pte++, addr += PAGE_SIZE, addr != end);
496 
497 	if (hyp_pte_table_empty(start_pte))
498 		clear_hyp_pmd_entry(pmd);
499 }
500 
501 static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
502 {
503 	phys_addr_t next;
504 	pmd_t *pmd, *start_pmd;
505 
506 	start_pmd = pmd = pmd_offset(pud, addr);
507 	do {
508 		next = pmd_addr_end(addr, end);
509 		/* Hyp doesn't use huge pmds */
510 		if (!pmd_none(*pmd))
511 			unmap_hyp_ptes(pmd, addr, next);
512 	} while (pmd++, addr = next, addr != end);
513 
514 	if (hyp_pmd_table_empty(start_pmd))
515 		clear_hyp_pud_entry(pud);
516 }
517 
518 static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
519 {
520 	phys_addr_t next;
521 	pud_t *pud, *start_pud;
522 
523 	start_pud = pud = pud_offset(pgd, addr);
524 	do {
525 		next = pud_addr_end(addr, end);
526 		/* Hyp doesn't use huge puds */
527 		if (!pud_none(*pud))
528 			unmap_hyp_pmds(pud, addr, next);
529 	} while (pud++, addr = next, addr != end);
530 
531 	if (hyp_pud_table_empty(start_pud))
532 		clear_hyp_pgd_entry(pgd);
533 }
534 
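/*
 * Index into a hyp pgd whose geometry is passed in explicitly: the
 * idmap pgd may use a different number of entries than the regular
 * hyp pgd (see __kvm_idmap_ptrs_per_pgd()).
 */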
535 static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
536 {
537 	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
538 }
539 
540 static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
541 			      phys_addr_t start, u64 size)
542 {
543 	pgd_t *pgd;
544 	phys_addr_t addr = start, end = start + size;
545 	phys_addr_t next;
546 
547 	/*
548 	 * We don't unmap anything from HYP, except at the hyp tear down.
549 	 * Hence, we don't have to invalidate the TLBs here.
550 	 */
551 	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
552 	do {
553 		next = pgd_addr_end(addr, end);
554 		if (!pgd_none(*pgd))
555 			unmap_hyp_puds(pgd, addr, next);
556 	} while (pgd++, addr = next, addr != end);
557 }
558 
559 static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
560 {
561 	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
562 }
563 
564 static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
565 {
566 	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
567 }
568 
569 /**
570  * free_hyp_pgds - free Hyp-mode page tables
571  *
572  * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
573  * therefore contains either mappings in the kernel memory area (above
574  * PAGE_OFFSET), or device mappings in the idmap range.
575  *
576  * boot_hyp_pgd should only map the idmap range, and is only used in
577  * the extended idmap case.
578  */
579 void free_hyp_pgds(void)
580 {
581 	pgd_t *id_pgd;
582 
583 	mutex_lock(&kvm_hyp_pgd_mutex);
584 
585 	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
586 
587 	if (id_pgd) {
588 		/* In case we never called hyp_mmu_init() */
589 		if (!io_map_base)
590 			io_map_base = hyp_idmap_start;
591 		unmap_hyp_idmap_range(id_pgd, io_map_base,
592 				      hyp_idmap_start + PAGE_SIZE - io_map_base);
593 	}
594 
595 	if (boot_hyp_pgd) {
596 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
597 		boot_hyp_pgd = NULL;
598 	}
599 
600 	if (hyp_pgd) {
601 		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
602 				(uintptr_t)high_memory - PAGE_OFFSET);
603 
604 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
605 		hyp_pgd = NULL;
606 	}
607 	if (merged_hyp_pgd) {
608 		clear_page(merged_hyp_pgd);
609 		free_page((unsigned long)merged_hyp_pgd);
610 		merged_hyp_pgd = NULL;
611 	}
612 
613 	mutex_unlock(&kvm_hyp_pgd_mutex);
614 }
615 
616 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
617 				    unsigned long end, unsigned long pfn,
618 				    pgprot_t prot)
619 {
620 	pte_t *pte;
621 	unsigned long addr;
622 
623 	addr = start;
624 	do {
625 		pte = pte_offset_kernel(pmd, addr);
626 		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
627 		get_page(virt_to_page(pte));
628 		pfn++;
629 	} while (addr += PAGE_SIZE, addr != end);
630 }
631 
632 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
633 				   unsigned long end, unsigned long pfn,
634 				   pgprot_t prot)
635 {
636 	pmd_t *pmd;
637 	pte_t *pte;
638 	unsigned long addr, next;
639 
640 	addr = start;
641 	do {
642 		pmd = pmd_offset(pud, addr);
643 
644 		BUG_ON(pmd_sect(*pmd));
645 
646 		if (pmd_none(*pmd)) {
647 			pte = pte_alloc_one_kernel(NULL);
648 			if (!pte) {
649 				kvm_err("Cannot allocate Hyp pte\n");
650 				return -ENOMEM;
651 			}
652 			kvm_pmd_populate(pmd, pte);
653 			get_page(virt_to_page(pmd));
654 		}
655 
656 		next = pmd_addr_end(addr, end);
657 
658 		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
659 		pfn += (next - addr) >> PAGE_SHIFT;
660 	} while (addr = next, addr != end);
661 
662 	return 0;
663 }
664 
665 static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
666 				   unsigned long end, unsigned long pfn,
667 				   pgprot_t prot)
668 {
669 	pud_t *pud;
670 	pmd_t *pmd;
671 	unsigned long addr, next;
672 	int ret;
673 
674 	addr = start;
675 	do {
676 		pud = pud_offset(pgd, addr);
677 
678 		if (pud_none_or_clear_bad(pud)) {
679 			pmd = pmd_alloc_one(NULL, addr);
680 			if (!pmd) {
681 				kvm_err("Cannot allocate Hyp pmd\n");
682 				return -ENOMEM;
683 			}
684 			kvm_pud_populate(pud, pmd);
685 			get_page(virt_to_page(pud));
686 		}
687 
688 		next = pud_addr_end(addr, end);
689 		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
690 		if (ret)
691 			return ret;
692 		pfn += (next - addr) >> PAGE_SHIFT;
693 	} while (addr = next, addr != end);
694 
695 	return 0;
696 }
697 
698 static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
699 				 unsigned long start, unsigned long end,
700 				 unsigned long pfn, pgprot_t prot)
701 {
702 	pgd_t *pgd;
703 	pud_t *pud;
704 	unsigned long addr, next;
705 	int err = 0;
706 
707 	mutex_lock(&kvm_hyp_pgd_mutex);
708 	addr = start & PAGE_MASK;
709 	end = PAGE_ALIGN(end);
710 	do {
711 		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
712 
713 		if (pgd_none(*pgd)) {
714 			pud = pud_alloc_one(NULL, addr);
715 			if (!pud) {
716 				kvm_err("Cannot allocate Hyp pud\n");
717 				err = -ENOMEM;
718 				goto out;
719 			}
720 			kvm_pgd_populate(pgd, pud);
721 			get_page(virt_to_page(pgd));
722 		}
723 
724 		next = pgd_addr_end(addr, end);
725 		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
726 		if (err)
727 			goto out;
728 		pfn += (next - addr) >> PAGE_SHIFT;
729 	} while (addr = next, addr != end);
730 out:
731 	mutex_unlock(&kvm_hyp_pgd_mutex);
732 	return err;
733 }
734 
735 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
736 {
737 	if (!is_vmalloc_addr(kaddr)) {
738 		BUG_ON(!virt_addr_valid(kaddr));
739 		return __pa(kaddr);
740 	} else {
741 		return page_to_phys(vmalloc_to_page(kaddr)) +
742 		       offset_in_page(kaddr);
743 	}
744 }
745 
746 /**
747  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
748  * @from:	The virtual kernel start address of the range
749  * @to:		The virtual kernel end address of the range (exclusive)
750  * @prot:	The protection to be applied to this range
751  *
752  * The same virtual address as the kernel virtual address is also used
753  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
754  * physical pages.
755  */
756 int create_hyp_mappings(void *from, void *to, pgprot_t prot)
757 {
758 	phys_addr_t phys_addr;
759 	unsigned long virt_addr;
760 	unsigned long start = kern_hyp_va((unsigned long)from);
761 	unsigned long end = kern_hyp_va((unsigned long)to);
762 
763 	if (is_kernel_in_hyp_mode())
764 		return 0;
765 
766 	start = start & PAGE_MASK;
767 	end = PAGE_ALIGN(end);
768 
769 	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
770 		int err;
771 
772 		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
773 		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
774 					    virt_addr, virt_addr + PAGE_SIZE,
775 					    __phys_to_pfn(phys_addr),
776 					    prot);
777 		if (err)
778 			return err;
779 	}
780 
781 	return 0;
782 }
783 
784 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
785 					unsigned long *haddr, pgprot_t prot)
786 {
787 	pgd_t *pgd = hyp_pgd;
788 	unsigned long base;
789 	int ret = 0;
790 
791 	mutex_lock(&kvm_hyp_pgd_mutex);
792 
793 	/*
794 	 * This assumes that we have enough space below the idmap
795 	 * page to allocate our VAs. If not, the check below will
796 	 * kick in. A potential alternative would be to detect that
797 	 * overflow and switch to an allocation above the idmap.
798 	 *
799 	 * The allocated size is always a multiple of PAGE_SIZE.
800 	 */
801 	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
802 	base = io_map_base - size;
803 
804 	/*
805 	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
806 	 * allocating the new area, as it would indicate we've
807 	 * overflowed the idmap/IO address range.
808 	 */
809 	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
810 		ret = -ENOMEM;
811 	else
812 		io_map_base = base;
813 
814 	mutex_unlock(&kvm_hyp_pgd_mutex);
815 
816 	if (ret)
817 		goto out;
818 
819 	if (__kvm_cpu_uses_extended_idmap())
820 		pgd = boot_hyp_pgd;
821 
822 	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
823 				    base, base + size,
824 				    __phys_to_pfn(phys_addr), prot);
825 	if (ret)
826 		goto out;
827 
828 	*haddr = base + offset_in_page(phys_addr);
829 
830 out:
831 	return ret;
832 }
833 
834 /**
835  * create_hyp_io_mappings - Map IO into both kernel and HYP
836  * @phys_addr:	The physical start address which gets mapped
837  * @size:	Size of the region being mapped
838  * @kaddr:	Kernel VA for this mapping
839  * @haddr:	HYP VA for this mapping
840  */
841 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
842 			   void __iomem **kaddr,
843 			   void __iomem **haddr)
844 {
845 	unsigned long addr;
846 	int ret;
847 
848 	*kaddr = ioremap(phys_addr, size);
849 	if (!*kaddr)
850 		return -ENOMEM;
851 
852 	if (is_kernel_in_hyp_mode()) {
853 		*haddr = *kaddr;
854 		return 0;
855 	}
856 
857 	ret = __create_hyp_private_mapping(phys_addr, size,
858 					   &addr, PAGE_HYP_DEVICE);
859 	if (ret) {
860 		iounmap(*kaddr);
861 		*kaddr = NULL;
862 		*haddr = NULL;
863 		return ret;
864 	}
865 
866 	*haddr = (void __iomem *)addr;
867 	return 0;
868 }
869 
870 /**
871  * create_hyp_exec_mappings - Map an executable range into HYP
872  * @phys_addr:	The physical start address which gets mapped
873  * @size:	Size of the region being mapped
874  * @haddr:	HYP VA for this mapping
875  */
876 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
877 			     void **haddr)
878 {
879 	unsigned long addr;
880 	int ret;
881 
882 	BUG_ON(is_kernel_in_hyp_mode());
883 
884 	ret = __create_hyp_private_mapping(phys_addr, size,
885 					   &addr, PAGE_HYP_EXEC);
886 	if (ret) {
887 		*haddr = NULL;
888 		return ret;
889 	}
890 
891 	*haddr = (void *)addr;
892 	return 0;
893 }
894 
895 /**
896  * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
897  * @kvm:	The KVM struct pointer for the VM.
898  *
899  * Allocates only the stage-2 HW PGD level table(s) of size defined by
900  * stage2_pgd_size(kvm).
901  *
902  * Note we don't need locking here as this is only called when the VM is
903  * created, which can only be done once.
904  */
905 int kvm_alloc_stage2_pgd(struct kvm *kvm)
906 {
907 	phys_addr_t pgd_phys;
908 	pgd_t *pgd;
909 
910 	if (kvm->arch.pgd != NULL) {
911 		kvm_err("kvm_arch already initialized?\n");
912 		return -EINVAL;
913 	}
914 
915 	/* Allocate the HW PGD, making sure that each page gets its own refcount */
916 	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
917 	if (!pgd)
918 		return -ENOMEM;
919 
920 	pgd_phys = virt_to_phys(pgd);
921 	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
922 		return -EINVAL;
923 
924 	kvm->arch.pgd = pgd;
925 	kvm->arch.pgd_phys = pgd_phys;
926 	return 0;
927 }
928 
929 static void stage2_unmap_memslot(struct kvm *kvm,
930 				 struct kvm_memory_slot *memslot)
931 {
932 	hva_t hva = memslot->userspace_addr;
933 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
934 	phys_addr_t size = PAGE_SIZE * memslot->npages;
935 	hva_t reg_end = hva + size;
936 
937 	/*
938 	 * A memory region could potentially cover multiple VMAs, and any holes
939 	 * between them, so iterate over all of them to find out if we should
940 	 * unmap any of them.
941 	 *
942 	 *     +--------------------------------------------+
943 	 * +---------------+----------------+   +----------------+
944 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
945 	 * +---------------+----------------+   +----------------+
946 	 *     |               memory region                |
947 	 *     +--------------------------------------------+
948 	 */
949 	do {
950 		struct vm_area_struct *vma = find_vma(current->mm, hva);
951 		hva_t vm_start, vm_end;
952 
953 		if (!vma || vma->vm_start >= reg_end)
954 			break;
955 
956 		/*
957 		 * Take the intersection of this VMA with the memory region
958 		 */
959 		vm_start = max(hva, vma->vm_start);
960 		vm_end = min(reg_end, vma->vm_end);
961 
962 		if (!(vma->vm_flags & VM_PFNMAP)) {
963 			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
964 			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
965 		}
966 		hva = vm_end;
967 	} while (hva < reg_end);
968 }
969 
970 /**
971  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
972  * @kvm: The struct kvm pointer
973  *
974  * Go through the memory regions and unmap any regular RAM
975  * backing memory already mapped to the VM.
976  */
977 void stage2_unmap_vm(struct kvm *kvm)
978 {
979 	struct kvm_memslots *slots;
980 	struct kvm_memory_slot *memslot;
981 	int idx;
982 
983 	idx = srcu_read_lock(&kvm->srcu);
984 	down_read(&current->mm->mmap_sem);
985 	spin_lock(&kvm->mmu_lock);
986 
987 	slots = kvm_memslots(kvm);
988 	kvm_for_each_memslot(memslot, slots)
989 		stage2_unmap_memslot(kvm, memslot);
990 
991 	spin_unlock(&kvm->mmu_lock);
992 	up_read(&current->mm->mmap_sem);
993 	srcu_read_unlock(&kvm->srcu, idx);
994 }
995 
996 /**
997  * kvm_free_stage2_pgd - free all stage-2 tables
998  * @kvm:	The KVM struct pointer for the VM.
999  *
1000  * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
1001  * underlying level-2 and level-3 tables before freeing the actual level-1 table
1002  * and setting the struct pointer to NULL.
1003  */
1004 void kvm_free_stage2_pgd(struct kvm *kvm)
1005 {
1006 	void *pgd = NULL;
1007 
1008 	spin_lock(&kvm->mmu_lock);
1009 	if (kvm->arch.pgd) {
1010 		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
1011 		pgd = READ_ONCE(kvm->arch.pgd);
1012 		kvm->arch.pgd = NULL;
1013 		kvm->arch.pgd_phys = 0;
1014 	}
1015 	spin_unlock(&kvm->mmu_lock);
1016 
1017 	/* Free the HW pgd, one page at a time */
1018 	if (pgd)
1019 		free_pages_exact(pgd, stage2_pgd_size(kvm));
1020 }
1021 
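/*
 * Walk the stage-2 tables down to the PUD level for @addr, allocating
 * missing levels from @cache when one is supplied. Returns NULL if a
 * level is missing and no cache was provided.
 */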
1022 static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1023 			     phys_addr_t addr)
1024 {
1025 	pgd_t *pgd;
1026 	pud_t *pud;
1027 
1028 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1029 	if (stage2_pgd_none(kvm, *pgd)) {
1030 		if (!cache)
1031 			return NULL;
1032 		pud = mmu_memory_cache_alloc(cache);
1033 		stage2_pgd_populate(kvm, pgd, pud);
1034 		get_page(virt_to_page(pgd));
1035 	}
1036 
1037 	return stage2_pud_offset(kvm, pgd, addr);
1038 }
1039 
1040 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1041 			     phys_addr_t addr)
1042 {
1043 	pud_t *pud;
1044 	pmd_t *pmd;
1045 
1046 	pud = stage2_get_pud(kvm, cache, addr);
1047 	if (!pud || stage2_pud_huge(kvm, *pud))
1048 		return NULL;
1049 
1050 	if (stage2_pud_none(kvm, *pud)) {
1051 		if (!cache)
1052 			return NULL;
1053 		pmd = mmu_memory_cache_alloc(cache);
1054 		stage2_pud_populate(kvm, pud, pmd);
1055 		get_page(virt_to_page(pud));
1056 	}
1057 
1058 	return stage2_pmd_offset(kvm, pud, addr);
1059 }
1060 
1061 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
1062 			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
1063 {
1064 	pmd_t *pmd, old_pmd;
1065 
1066 retry:
1067 	pmd = stage2_get_pmd(kvm, cache, addr);
1068 	VM_BUG_ON(!pmd);
1069 
1070 	old_pmd = *pmd;
1071 	/*
1072 	 * Multiple vcpus faulting on the same PMD entry can
1073 	 * lead to them sequentially updating the PMD with the
1074 	 * same value. Following the break-before-make
1075 	 * (pmd_clear() followed by tlb_flush()) process can
1076 	 * hinder forward progress due to refaults generated
1077 	 * on missing translations.
1078 	 *
1079 	 * Skip updating the page table if the entry is
1080 	 * unchanged.
1081 	 */
1082 	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
1083 		return 0;
1084 
1085 	if (pmd_present(old_pmd)) {
1086 		/*
1087 		 * If we already have PTE level mapping for this block,
1088 		 * we must unmap it to avoid inconsistent TLB state and
1089 		 * leaking the table page. We could end up in this situation
1090 		 * if the memory slot was marked for dirty logging and was
1091 		 * reverted, leaving PTE level mappings for the pages accessed
1092 		 * during the period. So, unmap the PTE level mapping for this
1093 		 * block and retry, as we could have released the upper level
1094 		 * table in the process.
1095 		 *
1096 		 * Normal THP split/merge follows mmu_notifier callbacks and is
1097 		 * handled accordingly.
1098 		 */
1099 		if (!pmd_thp_or_huge(old_pmd)) {
1100 			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
1101 			goto retry;
1102 		}
1103 		/*
1104 		 * Mapping in huge pages should only happen through a
1105 		 * fault.  If a page is merged into a transparent huge
1106 		 * page, the individual subpages of that huge page
1107 		 * should be unmapped through MMU notifiers before we
1108 		 * get here.
1109 		 *
1110 		 * Merging of CompoundPages is not supported; they
1111 		 * should instead be split first, unmapped, merged,
1112 		 * and mapped back in on-demand.
1113 		 */
1114 		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
1115 		pmd_clear(pmd);
1116 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1117 	} else {
1118 		get_page(virt_to_page(pmd));
1119 	}
1120 
1121 	kvm_set_pmd(pmd, *new_pmd);
1122 	return 0;
1123 }
1124 
1125 static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1126 			       phys_addr_t addr, const pud_t *new_pudp)
1127 {
1128 	pud_t *pudp, old_pud;
1129 
1130 retry:
1131 	pudp = stage2_get_pud(kvm, cache, addr);
1132 	VM_BUG_ON(!pudp);
1133 
1134 	old_pud = *pudp;
1135 
1136 	/*
1137 	 * A large number of vcpus faulting on the same stage 2 entry
1138 	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
1139 	 * Skip updating the page tables if there is no change.
1140 	 */
1141 	if (pud_val(old_pud) == pud_val(*new_pudp))
1142 		return 0;
1143 
1144 	if (stage2_pud_present(kvm, old_pud)) {
1145 		/*
1146 		 * If we already have table level mapping for this block, unmap
1147 		 * the range for this block and retry.
1148 		 */
1149 		if (!stage2_pud_huge(kvm, old_pud)) {
1150 			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
1151 			goto retry;
1152 		}
1153 
1154 		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
1155 		stage2_pud_clear(kvm, pudp);
1156 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1157 	} else {
1158 		get_page(virt_to_page(pudp));
1159 	}
1160 
1161 	kvm_set_pud(pudp, *new_pudp);
1162 	return 0;
1163 }
1164 
1165 /*
1166  * stage2_get_leaf_entry - walk the stage2 VM page tables and return
1167  * true if a valid and present leaf-entry is found. A pointer to the
1168  * leaf-entry is returned in the appropriate level variable - pudpp,
1169  * pmdpp, ptepp.
1170  */
1171 static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
1172 				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
1173 {
1174 	pud_t *pudp;
1175 	pmd_t *pmdp;
1176 	pte_t *ptep;
1177 
1178 	*pudpp = NULL;
1179 	*pmdpp = NULL;
1180 	*ptepp = NULL;
1181 
1182 	pudp = stage2_get_pud(kvm, NULL, addr);
1183 	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1184 		return false;
1185 
1186 	if (stage2_pud_huge(kvm, *pudp)) {
1187 		*pudpp = pudp;
1188 		return true;
1189 	}
1190 
1191 	pmdp = stage2_pmd_offset(kvm, pudp, addr);
1192 	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1193 		return false;
1194 
1195 	if (pmd_thp_or_huge(*pmdp)) {
1196 		*pmdpp = pmdp;
1197 		return true;
1198 	}
1199 
1200 	ptep = pte_offset_kernel(pmdp, addr);
1201 	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1202 		return false;
1203 
1204 	*ptepp = ptep;
1205 	return true;
1206 }
1207 
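/*
 * Check whether the leaf entry currently mapping @addr is present and
 * already carries stage-2 execute permission for a mapping of size @sz.
 */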
1208 static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr, unsigned long sz)
1209 {
1210 	pud_t *pudp;
1211 	pmd_t *pmdp;
1212 	pte_t *ptep;
1213 	bool found;
1214 
1215 	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
1216 	if (!found)
1217 		return false;
1218 
1219 	if (pudp)
1220 		return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1221 	else if (pmdp)
1222 		return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1223 	else
1224 		return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1225 }
1226 
1227 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1228 			  phys_addr_t addr, const pte_t *new_pte,
1229 			  unsigned long flags)
1230 {
1231 	pud_t *pud;
1232 	pmd_t *pmd;
1233 	pte_t *pte, old_pte;
1234 	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1235 	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1236 
1237 	VM_BUG_ON(logging_active && !cache);
1238 
1239 	/* Create stage-2 page table mapping - Levels 0 and 1 */
1240 	pud = stage2_get_pud(kvm, cache, addr);
1241 	if (!pud) {
1242 		/*
1243 		 * Ignore calls from kvm_set_spte_hva for unallocated
1244 		 * address ranges.
1245 		 */
1246 		return 0;
1247 	}
1248 
1249 	/*
1250 	 * While dirty page logging - dissolve huge PUD, then continue
1251 	 * on to allocate page.
1252 	 */
1253 	if (logging_active)
1254 		stage2_dissolve_pud(kvm, addr, pud);
1255 
1256 	if (stage2_pud_none(kvm, *pud)) {
1257 		if (!cache)
1258 			return 0; /* ignore calls from kvm_set_spte_hva */
1259 		pmd = mmu_memory_cache_alloc(cache);
1260 		stage2_pud_populate(kvm, pud, pmd);
1261 		get_page(virt_to_page(pud));
1262 	}
1263 
1264 	pmd = stage2_pmd_offset(kvm, pud, addr);
1265 	if (!pmd) {
1266 		/*
1267 		 * Ignore calls from kvm_set_spte_hva for unallocated
1268 		 * address ranges.
1269 		 */
1270 		return 0;
1271 	}
1272 
1273 	/*
1274 	 * While dirty page logging - dissolve huge PMD, then continue on to
1275 	 * allocate page.
1276 	 */
1277 	if (logging_active)
1278 		stage2_dissolve_pmd(kvm, addr, pmd);
1279 
1280 	/* Create stage-2 page mappings - Level 2 */
1281 	if (pmd_none(*pmd)) {
1282 		if (!cache)
1283 			return 0; /* ignore calls from kvm_set_spte_hva */
1284 		pte = mmu_memory_cache_alloc(cache);
1285 		kvm_pmd_populate(pmd, pte);
1286 		get_page(virt_to_page(pmd));
1287 	}
1288 
1289 	pte = pte_offset_kernel(pmd, addr);
1290 
1291 	if (iomap && pte_present(*pte))
1292 		return -EFAULT;
1293 
1294 	/* Create 2nd stage page table mapping - Level 3 */
1295 	old_pte = *pte;
1296 	if (pte_present(old_pte)) {
1297 		/* Skip page table update if there is no change */
1298 		if (pte_val(old_pte) == pte_val(*new_pte))
1299 			return 0;
1300 
1301 		kvm_set_pte(pte, __pte(0));
1302 		kvm_tlb_flush_vmid_ipa(kvm, addr);
1303 	} else {
1304 		get_page(virt_to_page(pte));
1305 	}
1306 
1307 	kvm_set_pte(pte, *new_pte);
1308 	return 0;
1309 }
1310 
1311 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1312 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1313 {
1314 	if (pte_young(*pte)) {
1315 		*pte = pte_mkold(*pte);
1316 		return 1;
1317 	}
1318 	return 0;
1319 }
1320 #else
1321 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1322 {
1323 	return __ptep_test_and_clear_young(pte);
1324 }
1325 #endif
1326 
1327 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1328 {
1329 	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1330 }
1331 
1332 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1333 {
1334 	return stage2_ptep_test_and_clear_young((pte_t *)pud);
1335 }
1336 
1337 /**
1338  * kvm_phys_addr_ioremap - map a device range to guest IPA
1339  *
1340  * @kvm:	The KVM pointer
1341  * @guest_ipa:	The IPA at which to insert the mapping
1342  * @pa:		The physical address of the device
1343  * @size:	The size of the mapping
1344  */
1345 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1346 			  phys_addr_t pa, unsigned long size, bool writable)
1347 {
1348 	phys_addr_t addr, end;
1349 	int ret = 0;
1350 	unsigned long pfn;
1351 	struct kvm_mmu_memory_cache cache = { 0, };
1352 
1353 	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1354 	pfn = __phys_to_pfn(pa);
1355 
1356 	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1357 		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
1358 
1359 		if (writable)
1360 			pte = kvm_s2pte_mkwrite(pte);
1361 
1362 		ret = mmu_topup_memory_cache(&cache,
1363 					     kvm_mmu_cache_min_pages(kvm),
1364 					     KVM_NR_MEM_OBJS);
1365 		if (ret)
1366 			goto out;
1367 		spin_lock(&kvm->mmu_lock);
1368 		ret = stage2_set_pte(kvm, &cache, addr, &pte,
1369 						KVM_S2PTE_FLAG_IS_IOMAP);
1370 		spin_unlock(&kvm->mmu_lock);
1371 		if (ret)
1372 			goto out;
1373 
1374 		pfn++;
1375 	}
1376 
1377 out:
1378 	mmu_free_memory_cache(&cache);
1379 	return ret;
1380 }
1381 
1382 static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
1383 {
1384 	kvm_pfn_t pfn = *pfnp;
1385 	gfn_t gfn = *ipap >> PAGE_SHIFT;
1386 	struct page *page = pfn_to_page(pfn);
1387 
1388 	/*
1389 	 * PageTransCompoundMap() returns true for THP and
1390 	 * hugetlbfs. Make sure the adjustment is done only for THP
1391 	 * pages.
1392 	 */
1393 	if (!PageHuge(page) && PageTransCompoundMap(page)) {
1394 		unsigned long mask;
1395 		/*
1396 		 * The address we faulted on is backed by a transparent huge
1397 		 * page.  However, because we map the compound huge page and
1398 		 * not the individual tail page, we need to transfer the
1399 		 * refcount to the head page.  We have to be careful that the
1400 		 * THP doesn't start to split while we are adjusting the
1401 		 * refcounts.
1402 		 *
1403 		 * We are sure this doesn't happen, because mmu_notifier_retry
1404 		 * was successful and we are holding the mmu_lock, so if this
1405 		 * THP is trying to split, it will be blocked in the mmu
1406 		 * notifier before touching any of the pages, specifically
1407 		 * before being able to call __split_huge_page_refcount().
1408 		 *
1409 		 * We can therefore safely transfer the refcount from PG_tail
1410 		 * to PG_head and switch the pfn from a tail page to the head
1411 		 * page accordingly.
1412 		 */
1413 		mask = PTRS_PER_PMD - 1;
1414 		VM_BUG_ON((gfn & mask) != (pfn & mask));
1415 		if (pfn & mask) {
1416 			*ipap &= PMD_MASK;
1417 			kvm_release_pfn_clean(pfn);
1418 			pfn &= ~mask;
1419 			kvm_get_pfn(pfn);
1420 			*pfnp = pfn;
1421 		}
1422 
1423 		return true;
1424 	}
1425 
1426 	return false;
1427 }
1428 
1429 /**
1430  * stage2_wp_ptes - write protect PMD range
1431  * @pmd:	pointer to pmd entry
1432  * @addr:	range start address
1433  * @end:	range end address
1434  */
1435 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1436 {
1437 	pte_t *pte;
1438 
1439 	pte = pte_offset_kernel(pmd, addr);
1440 	do {
1441 		if (!pte_none(*pte)) {
1442 			if (!kvm_s2pte_readonly(pte))
1443 				kvm_set_s2pte_readonly(pte);
1444 		}
1445 	} while (pte++, addr += PAGE_SIZE, addr != end);
1446 }
1447 
1448 /**
1449  * stage2_wp_pmds - write protect PUD range
1450  * @kvm:	kvm instance for the VM
1451  * @pud:	pointer to pud entry
1452  * @addr:	range start address
1453  * @end:	range end address
1454  */
1455 static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
1456 			   phys_addr_t addr, phys_addr_t end)
1457 {
1458 	pmd_t *pmd;
1459 	phys_addr_t next;
1460 
1461 	pmd = stage2_pmd_offset(kvm, pud, addr);
1462 
1463 	do {
1464 		next = stage2_pmd_addr_end(kvm, addr, end);
1465 		if (!pmd_none(*pmd)) {
1466 			if (pmd_thp_or_huge(*pmd)) {
1467 				if (!kvm_s2pmd_readonly(pmd))
1468 					kvm_set_s2pmd_readonly(pmd);
1469 			} else {
1470 				stage2_wp_ptes(pmd, addr, next);
1471 			}
1472 		}
1473 	} while (pmd++, addr = next, addr != end);
1474 }
1475 
1476 /**
1477  * stage2_wp_puds - write protect PGD range
1478  * @pgd:	pointer to pgd entry
1479  * @addr:	range start address
1480  * @end:	range end address
1481  */
1482 static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
1483 			    phys_addr_t addr, phys_addr_t end)
1484 {
1485 	pud_t *pud;
1486 	phys_addr_t next;
1487 
1488 	pud = stage2_pud_offset(kvm, pgd, addr);
1489 	do {
1490 		next = stage2_pud_addr_end(kvm, addr, end);
1491 		if (!stage2_pud_none(kvm, *pud)) {
1492 			if (stage2_pud_huge(kvm, *pud)) {
1493 				if (!kvm_s2pud_readonly(pud))
1494 					kvm_set_s2pud_readonly(pud);
1495 			} else {
1496 				stage2_wp_pmds(kvm, pud, addr, next);
1497 			}
1498 		}
1499 	} while (pud++, addr = next, addr != end);
1500 }
1501 
1502 /**
1503  * stage2_wp_range() - write protect stage2 memory region range
1504  * @kvm:	The KVM pointer
1505  * @addr:	Start address of range
1506  * @end:	End address of range
1507  */
1508 static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1509 {
1510 	pgd_t *pgd;
1511 	phys_addr_t next;
1512 
1513 	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1514 	do {
1515 		/*
1516 		 * Release kvm_mmu_lock periodically if the memory region is
1517 		 * large. Otherwise, we may see kernel panics with
1518 		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1519 		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1520 		 * will also starve other vCPUs. We have to also make sure
1521 		 * that the page tables are not freed while we released
1522 		 * the lock.
1523 		 */
1524 		cond_resched_lock(&kvm->mmu_lock);
1525 		if (!READ_ONCE(kvm->arch.pgd))
1526 			break;
1527 		next = stage2_pgd_addr_end(kvm, addr, end);
1528 		if (stage2_pgd_present(kvm, *pgd))
1529 			stage2_wp_puds(kvm, pgd, addr, next);
1530 	} while (pgd++, addr = next, addr != end);
1531 }
1532 
1533 /**
1534  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1535  * @kvm:	The KVM pointer
1536  * @slot:	The memory slot to write protect
1537  *
1538  * Called to start logging dirty pages after the memory region's
1539  * KVM_MEM_LOG_DIRTY_PAGES flag is set. After this function returns,
1540  * all present PUDs, PMDs and PTEs in the memory region are write protected,
1541  * and the dirty page log can then be read.
1542  *
1543  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1544  * serializing operations for VM memory regions.
1545  */
1546 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1547 {
1548 	struct kvm_memslots *slots = kvm_memslots(kvm);
1549 	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1550 	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1551 	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1552 
1553 	spin_lock(&kvm->mmu_lock);
1554 	stage2_wp_range(kvm, start, end);
1555 	spin_unlock(&kvm->mmu_lock);
1556 	kvm_flush_remote_tlbs(kvm);
1557 }
1558 
1559 /**
1560  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1561  * @kvm:	The KVM pointer
1562  * @slot:	The memory slot associated with mask
1563  * @gfn_offset:	The gfn offset in memory slot
1564  * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
1565  *		slot to be write protected
1566  *
1567  * Walks the bits set in mask and write protects the associated PTEs. Caller must
1568  * acquire kvm_mmu_lock.
1569  */
1570 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1571 		struct kvm_memory_slot *slot,
1572 		gfn_t gfn_offset, unsigned long mask)
1573 {
1574 	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1575 	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1576 	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1577 
1578 	stage2_wp_range(kvm, start, end);
1579 }
1580 
1581 /*
1582  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1583  * dirty pages.
1584  *
1585  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1586  * enable dirty logging for them.
1587  */
1588 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1589 		struct kvm_memory_slot *slot,
1590 		gfn_t gfn_offset, unsigned long mask)
1591 {
1592 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1593 }
1594 
1595 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1596 {
1597 	__clean_dcache_guest_page(pfn, size);
1598 }
1599 
1600 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1601 {
1602 	__invalidate_icache_guest_page(pfn, size);
1603 }
1604 
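/*
 * Deliver a BUS_MCEERR_AR signal for a hardware-poisoned page, using
 * the page size of the backing VMA as the reported error granularity.
 */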
1605 static void kvm_send_hwpoison_signal(unsigned long address,
1606 				     struct vm_area_struct *vma)
1607 {
1608 	short lsb;
1609 
1610 	if (is_vm_hugetlb_page(vma))
1611 		lsb = huge_page_shift(hstate_vma(vma));
1612 	else
1613 		lsb = PAGE_SHIFT;
1614 
1615 	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1616 }
1617 
1618 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1619 					       unsigned long hva,
1620 					       unsigned long map_size)
1621 {
1622 	gpa_t gpa_start;
1623 	hva_t uaddr_start, uaddr_end;
1624 	size_t size;
1625 
1626 	size = memslot->npages * PAGE_SIZE;
1627 
1628 	gpa_start = memslot->base_gfn << PAGE_SHIFT;
1629 
1630 	uaddr_start = memslot->userspace_addr;
1631 	uaddr_end = uaddr_start + size;
1632 
1633 	/*
1634 	 * Pages belonging to memslots that don't have the same alignment
1635 	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1636 	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1637 	 *
1638 	 * Consider a layout like the following:
1639 	 *
1640 	 *    memslot->userspace_addr:
1641 	 *    +-----+--------------------+--------------------+---+
1642 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1643 	 *    +-----+--------------------+--------------------+---+
1644 	 *
1645 	 *    memslot->base_gfn << PAGE_SIZE:
1646 	 *      +---+--------------------+--------------------+-----+
1647 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1648 	 *      +---+--------------------+--------------------+-----+
1649 	 *
1650 	 * If we create those stage-2 blocks, we'll end up with this incorrect
1651 	 * mapping:
1652 	 *   d -> f
1653 	 *   e -> g
1654 	 *   f -> h
1655 	 */
1656 	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1657 		return false;
1658 
1659 	/*
1660 	 * Next, let's make sure we're not trying to map anything not covered
1661 	 * by the memslot. This means we have to prohibit block size mappings
1662 	 * for the beginning and end of a non-block aligned and non-block sized
1663 	 * memory slot (illustrated by the head and tail parts of the
1664 	 * userspace view above containing pages 'abcde' and 'xyz',
1665 	 * respectively).
1666 	 *
1667 	 * Note that it doesn't matter if we do the check using the
1668 	 * userspace_addr or the base_gfn, as both are equally aligned (per
1669 	 * the check above) and equally sized.
1670 	 */
1671 	return (hva & ~(map_size - 1)) >= uaddr_start &&
1672 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1673 }
1674 
1675 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1676 			  struct kvm_memory_slot *memslot, unsigned long hva,
1677 			  unsigned long fault_status)
1678 {
1679 	int ret;
1680 	bool write_fault, writable, force_pte = false;
1681 	bool exec_fault, needs_exec;
1682 	unsigned long mmu_seq;
1683 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1684 	struct kvm *kvm = vcpu->kvm;
1685 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1686 	struct vm_area_struct *vma;
1687 	kvm_pfn_t pfn;
1688 	pgprot_t mem_type = PAGE_S2;
1689 	bool logging_active = memslot_is_logging(memslot);
1690 	unsigned long vma_pagesize, flags = 0;
1691 
1692 	write_fault = kvm_is_write_fault(vcpu);
1693 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
1694 	VM_BUG_ON(write_fault && exec_fault);
1695 
1696 	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1697 		kvm_err("Unexpected L2 read permission error\n");
1698 		return -EFAULT;
1699 	}
1700 
1701 	/* Let's check if we will get back a huge page backed by hugetlbfs */
1702 	down_read(&current->mm->mmap_sem);
1703 	vma = find_vma_intersection(current->mm, hva, hva + 1);
1704 	if (unlikely(!vma)) {
1705 		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1706 		up_read(&current->mm->mmap_sem);
1707 		return -EFAULT;
1708 	}
1709 
1710 	vma_pagesize = vma_kernel_pagesize(vma);
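	/*
	 * Fall back to a page-sized mapping when dirty logging is active
	 * (write protection is tracked per page), when the VMA is a raw
	 * PFN mapping, or when the memslot layout cannot accommodate a
	 * block mapping of this size.
	 */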
1711 	if (logging_active ||
1712 	    (vma->vm_flags & VM_PFNMAP) ||
1713 	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1714 		force_pte = true;
1715 		vma_pagesize = PAGE_SIZE;
1716 	}
1717 
1718 	/*
1719 	 * The stage2 has a minimum of 2 level table (For arm64 see
1720 	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1721 	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
1722 	 * As for PUD huge maps, we must make sure that we have at least
1723 	 * 3 levels, i.e, PMD is not folded.
1724 	 */
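	/*
	 * For a block mapping, resolve the fault on the block-aligned gfn
	 * so that the pfn we pin below is the head of the backing huge
	 * page.
	 */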
1725 	if (vma_pagesize == PMD_SIZE ||
1726 	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1727 		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1728 	up_read(&current->mm->mmap_sem);
1729 
1730 	/* We need at least the second and third level page table pages */
1731 	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
1732 				     KVM_NR_MEM_OBJS);
1733 	if (ret)
1734 		return ret;
1735 
1736 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
1737 	/*
1738 	 * Ensure the read of mmu_notifier_seq happens before we call
1739 	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1740 	 * the page we just got a reference to being unmapped before we have a
1741 	 * chance to grab the mmu_lock, which ensures that if the page gets
1742 	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1743 	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1744 	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1745 	 */
1746 	smp_rmb();
1747 
1748 	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1749 	if (pfn == KVM_PFN_ERR_HWPOISON) {
1750 		kvm_send_hwpoison_signal(hva, vma);
1751 		return 0;
1752 	}
1753 	if (is_error_noslot_pfn(pfn))
1754 		return -EFAULT;
1755 
1756 	if (kvm_is_device_pfn(pfn)) {
1757 		mem_type = PAGE_S2_DEVICE;
1758 		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1759 		force_pte = true;
1760 	} else if (logging_active) {
1761 		/*
1762 		 * Faults on pages in a memslot with logging enabled
1763 		 * should not be mapped with huge pages (it introduces churn
1764 		 * and performance degradation), so force a pte mapping.
1765 		 */
1766 		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1767 
1768 		/*
1769 		 * Only actually map the page as writable if this was a write
1770 		 * fault.
1771 		 */
1772 		if (!write_fault)
1773 			writable = false;
1774 	}
1775 
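	/*
	 * Instruction fetches from device memory cannot be emulated;
	 * report -ENOEXEC so the caller injects a prefetch abort instead.
	 */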
1776 	if (exec_fault && is_iomap(flags))
1777 		return -ENOEXEC;
1778 
1779 	spin_lock(&kvm->mmu_lock);
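	/*
	 * If an MMU notifier invalidation raced with us since we sampled
	 * mmu_notifier_seq, drop everything; the guest will simply take
	 * the fault again.
	 */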
1780 	if (mmu_notifier_retry(kvm, mmu_seq))
1781 		goto out_unlock;
1782 
1783 	if (vma_pagesize == PAGE_SIZE && !force_pte) {
1784 		/*
1785 		 * Only PMD_SIZE transparent hugepages (THP) are
1786 		 * currently supported. This code will need to be
1787 		 * updated to support other THP sizes.
1788 		 *
1789 		 * Make sure the host VA and the guest IPA are sufficiently
1790 		 * aligned and that the block is contained within the memslot.
1791 		 */
1792 		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
1793 		    transparent_hugepage_adjust(&pfn, &fault_ipa))
1794 			vma_pagesize = PMD_SIZE;
1795 	}
1796 
1797 	if (writable)
1798 		kvm_set_pfn_dirty(pfn);
1799 
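	/*
	 * On a new mapping (as opposed to a permission fault), clean the
	 * data cache for the page so the guest sees consistent data even
	 * before it enables its own caches. Device mappings are left alone.
	 */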
1800 	if (fault_status != FSC_PERM && !is_iomap(flags))
1801 		clean_dcache_guest_page(pfn, vma_pagesize);
1802 
1803 	if (exec_fault)
1804 		invalidate_icache_guest_page(pfn, vma_pagesize);
1805 
1806 	/*
1807 	 * If we took an execution fault we have made the
1808 	 * icache/dcache coherent above and should now let the s2
1809 	 * mapping be executable.
1810 	 *
1811 	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1812 	 * execute permissions, and we preserve whatever we have.
1813 	 */
1814 	needs_exec = exec_fault ||
1815 		(fault_status == FSC_PERM &&
1816 		 stage2_is_exec(kvm, fault_ipa, vma_pagesize));
1817 
1818 	/*
1819 	 * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and
1820 	 * all we have is a 2-level page table. Trying to map a PUD in
1821 	 * this case would be fatally wrong.
1822 	 */
1823 	if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) {
1824 		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1825 
1826 		new_pud = kvm_pud_mkhuge(new_pud);
1827 		if (writable)
1828 			new_pud = kvm_s2pud_mkwrite(new_pud);
1829 
1830 		if (needs_exec)
1831 			new_pud = kvm_s2pud_mkexec(new_pud);
1832 
1833 		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
1834 	} else if (vma_pagesize == PMD_SIZE) {
1835 		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1836 
1837 		new_pmd = kvm_pmd_mkhuge(new_pmd);
1838 
1839 		if (writable)
1840 			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1841 
1842 		if (needs_exec)
1843 			new_pmd = kvm_s2pmd_mkexec(new_pmd);
1844 
1845 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1846 	} else {
1847 		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1848 
1849 		if (writable) {
1850 			new_pte = kvm_s2pte_mkwrite(new_pte);
1851 			mark_page_dirty(kvm, gfn);
1852 		}
1853 
1854 		if (needs_exec)
1855 			new_pte = kvm_s2pte_mkexec(new_pte);
1856 
1857 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1858 	}
1859 
1860 out_unlock:
1861 	spin_unlock(&kvm->mmu_lock);
1862 	kvm_set_pfn_accessed(pfn);
1863 	kvm_release_pfn_clean(pfn);
1864 	return ret;
1865 }
1866 
1867 /*
1868  * Resolve the access fault by making the page young again.
1869  * Note that because the faulting entry is guaranteed not to be
1870  * cached in the TLB, we don't need to invalidate anything.
1871  * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1872  * so there is no need for atomic (pte|pmd)_mkyoung operations.
1873  */
1874 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1875 {
1876 	pud_t *pud;
1877 	pmd_t *pmd;
1878 	pte_t *pte;
1879 	kvm_pfn_t pfn;
1880 	bool pfn_valid = false;
1881 
1882 	trace_kvm_access_fault(fault_ipa);
1883 
1884 	spin_lock(&vcpu->kvm->mmu_lock);
1885 
1886 	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
1887 		goto out;
1888 
1889 	if (pud) {		/* HugeTLB */
1890 		*pud = kvm_s2pud_mkyoung(*pud);
1891 		pfn = kvm_pud_pfn(*pud);
1892 		pfn_valid = true;
1893 	} else	if (pmd) {	/* THP, HugeTLB */
1894 		*pmd = pmd_mkyoung(*pmd);
1895 		pfn = pmd_pfn(*pmd);
1896 		pfn_valid = true;
1897 	} else {
1898 		*pte = pte_mkyoung(*pte);	/* Just a page... */
1899 		pfn = pte_pfn(*pte);
1900 		pfn_valid = true;
1901 	}
1902 
1903 out:
1904 	spin_unlock(&vcpu->kvm->mmu_lock);
1905 	if (pfn_valid)
1906 		kvm_set_pfn_accessed(pfn);
1907 }
1908 
1909 /**
1910  * kvm_handle_guest_abort - handles all 2nd stage aborts
1911  * @vcpu:	the VCPU pointer
1912  * @run:	the kvm_run structure
1913  *
1914  * Any abort that gets to the host is almost guaranteed to be caused by a
1915  * missing second stage translation table entry, which can mean that either the
1916  * missing second stage translation table entry, which means that either the
1917  * guest simply needs more memory and we must allocate an appropriate page, or
1918  * that the guest tried to access I/O memory, which is emulated by user
1919  * memory region has been registered as standard RAM by user space.
1920  */
1921 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1922 {
1923 	unsigned long fault_status;
1924 	phys_addr_t fault_ipa;
1925 	struct kvm_memory_slot *memslot;
1926 	unsigned long hva;
1927 	bool is_iabt, write_fault, writable;
1928 	gfn_t gfn;
1929 	int ret, idx;
1930 
1931 	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1932 
1933 	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1934 	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1935 
1936 	/* Synchronous External Abort? */
1937 	if (kvm_vcpu_dabt_isextabt(vcpu)) {
1938 		/*
1939 		 * For RAS the host kernel may handle this abort.
1940 		 * There is no need to pass the error into the guest.
1941 		 */
1942 		if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
1943 			return 1;
1944 
1945 		if (unlikely(!is_iabt)) {
1946 			kvm_inject_vabt(vcpu);
1947 			return 1;
1948 		}
1949 	}
1950 
1951 	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1952 			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
1953 
1954 	/* Check that the stage-2 fault is a translation, permission or access fault */
1955 	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1956 	    fault_status != FSC_ACCESS) {
1957 		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1958 			kvm_vcpu_trap_get_class(vcpu),
1959 			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1960 			(unsigned long)kvm_vcpu_get_hsr(vcpu));
1961 		return -EFAULT;
1962 	}
1963 
1964 	idx = srcu_read_lock(&vcpu->kvm->srcu);
1965 
1966 	gfn = fault_ipa >> PAGE_SHIFT;
1967 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
1968 	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1969 	write_fault = kvm_is_write_fault(vcpu);
1970 	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1971 		if (is_iabt) {
1972 			/* Prefetch Abort on I/O address */
1973 			ret = -ENOEXEC;
1974 			goto out;
1975 		}
1976 
1977 		/*
1978 		 * Check for a cache maintenance operation. Since we
1979 		 * ended-up here, we know it is outside of any memory
1980 		 * slot. But we can't find out if that is for a device,
1981 		 * or if the guest is just being stupid. The only thing
1982 		 * we know for sure is that this range cannot be cached.
1983 		 *
1984 		 * So let's assume that the guest is just being
1985 		 * cautious, and skip the instruction.
1986 		 */
1987 		if (kvm_vcpu_dabt_is_cm(vcpu)) {
1988 			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1989 			ret = 1;
1990 			goto out_unlock;
1991 		}
1992 
1993 		/*
1994 		 * The IPA is reported as [MAX:12], so we need to
1995 		 * complement it with the bottom 12 bits from the
1996 		 * faulting VA. This is always 12 bits, irrespective
1997 		 * of the page size.
1998 		 */
1999 		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
2000 		ret = io_mem_abort(vcpu, run, fault_ipa);
2001 		goto out_unlock;
2002 	}
2003 
2004 	/* Userspace should not be able to register out-of-bounds IPAs */
2005 	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
2006 
2007 	if (fault_status == FSC_ACCESS) {
2008 		handle_access_fault(vcpu, fault_ipa);
2009 		ret = 1;
2010 		goto out_unlock;
2011 	}
2012 
2013 	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
2014 	if (ret == 0)
2015 		ret = 1;
2016 out:
2017 	if (ret == -ENOEXEC) {
2018 		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2019 		ret = 1;
2020 	}
2021 out_unlock:
2022 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2023 	return ret;
2024 }
2025 
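/*
 * handle_hva_to_gpa() - apply @handler to every guest physical range that
 * maps to the HVA range [@start, @end).
 *
 * The individual handler return values are OR'ed together, so the walk
 * returns non-zero if any memslot intersection did.
 */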
2026 static int handle_hva_to_gpa(struct kvm *kvm,
2027 			     unsigned long start,
2028 			     unsigned long end,
2029 			     int (*handler)(struct kvm *kvm,
2030 					    gpa_t gpa, u64 size,
2031 					    void *data),
2032 			     void *data)
2033 {
2034 	struct kvm_memslots *slots;
2035 	struct kvm_memory_slot *memslot;
2036 	int ret = 0;
2037 
2038 	slots = kvm_memslots(kvm);
2039 
2040 	/* we only care about the pages that the guest sees */
2041 	kvm_for_each_memslot(memslot, slots) {
2042 		unsigned long hva_start, hva_end;
2043 		gpa_t gpa;
2044 
2045 		hva_start = max(start, memslot->userspace_addr);
2046 		hva_end = min(end, memslot->userspace_addr +
2047 					(memslot->npages << PAGE_SHIFT));
2048 		if (hva_start >= hva_end)
2049 			continue;
2050 
2051 		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
2052 		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
2053 	}
2054 
2055 	return ret;
2056 }
2057 
2058 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2059 {
2060 	unsigned flags = *(unsigned *)data;
2061 	bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
2062 
2063 	__unmap_stage2_range(kvm, gpa, size, may_block);
2064 	return 0;
2065 }
2066 
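/*
 * MMU notifier callback: unmap the stage-2 range backing [start, end).
 * @flags tells the handler whether the unmap is allowed to block.
 */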
2067 int kvm_unmap_hva_range(struct kvm *kvm,
2068 			unsigned long start, unsigned long end, unsigned flags)
2069 {
2070 	if (!kvm->arch.pgd)
2071 		return 0;
2072 
2073 	trace_kvm_unmap_hva_range(start, end);
2074 	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
2075 	return 0;
2076 }
2077 
2078 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2079 {
2080 	pte_t *pte = (pte_t *)data;
2081 
2082 	WARN_ON(size != PAGE_SIZE);
2083 	/*
2084 	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
2085 	 * flag clear because MMU notifiers will have unmapped a huge PMD before
2086 	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
2087 	 * therefore stage2_set_pte() never needs to clear out a huge PMD
2088 	 * through this calling path.
2089 	 */
2090 	stage2_set_pte(kvm, NULL, gpa, pte, 0);
2091 	return 0;
2092 }
2093 
2094 
2095 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2096 {
2097 	unsigned long end = hva + PAGE_SIZE;
2098 	kvm_pfn_t pfn = pte_pfn(pte);
2099 	pte_t stage2_pte;
2100 
2101 	if (!kvm->arch.pgd)
2102 		return 0;
2103 
2104 	trace_kvm_set_spte_hva(hva);
2105 
2106 	/*
2107 	 * We've moved a page around, probably through CoW, so let's treat it
2108 	 * just like a translation fault and clean the cache to the PoC.
2109 	 */
2110 	clean_dcache_guest_page(pfn, PAGE_SIZE);
2111 	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
2112 	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
2113 
2114 	return 0;
2115 }
2116 
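/*
 * Aging handlers: test (and, for kvm_age_hva_handler, clear) the stage-2
 * access flag of the leaf entry mapping @gpa, at PTE, PMD or PUD level.
 */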
2117 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2118 {
2119 	pud_t *pud;
2120 	pmd_t *pmd;
2121 	pte_t *pte;
2122 
2123 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2124 	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2125 		return 0;
2126 
2127 	if (pud)
2128 		return stage2_pudp_test_and_clear_young(pud);
2129 	else if (pmd)
2130 		return stage2_pmdp_test_and_clear_young(pmd);
2131 	else
2132 		return stage2_ptep_test_and_clear_young(pte);
2133 }
2134 
2135 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2136 {
2137 	pud_t *pud;
2138 	pmd_t *pmd;
2139 	pte_t *pte;
2140 
2141 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2142 	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2143 		return 0;
2144 
2145 	if (pud)
2146 		return kvm_s2pud_young(*pud);
2147 	else if (pmd)
2148 		return pmd_young(*pmd);
2149 	else
2150 		return pte_young(*pte);
2151 }
2152 
2153 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2154 {
2155 	if (!kvm->arch.pgd)
2156 		return 0;
2157 	trace_kvm_age_hva(start, end);
2158 	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2159 }
2160 
2161 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2162 {
2163 	if (!kvm->arch.pgd)
2164 		return 0;
2165 	trace_kvm_test_age_hva(hva);
2166 	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2167 				 kvm_test_age_hva_handler, NULL);
2168 }
2169 
2170 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
2171 {
2172 	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
2173 }
2174 
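/*
 * Return the physical address of the HYP translation tables: the merged
 * pgd when the CPU uses an extended idmap, the regular hyp_pgd otherwise.
 */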
2175 phys_addr_t kvm_mmu_get_httbr(void)
2176 {
2177 	if (__kvm_cpu_uses_extended_idmap())
2178 		return virt_to_phys(merged_hyp_pgd);
2179 	else
2180 		return virt_to_phys(hyp_pgd);
2181 }
2182 
2183 phys_addr_t kvm_get_idmap_vector(void)
2184 {
2185 	return hyp_idmap_vector;
2186 }
2187 
2188 static int kvm_map_idmap_text(pgd_t *pgd)
2189 {
2190 	int err;
2191 
2192 	/* Create the idmap in the boot page tables */
2193 	err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
2194 				      hyp_idmap_start, hyp_idmap_end,
2195 				      __phys_to_pfn(hyp_idmap_start),
2196 				      PAGE_HYP_EXEC);
2197 	if (err)
2198 		kvm_err("Failed to idmap %lx-%lx\n",
2199 			hyp_idmap_start, hyp_idmap_end);
2200 
2201 	return err;
2202 }
2203 
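/*
 * kvm_mmu_init() - set up the HYP page tables and identity mapping
 *
 * Compute the page-aligned bounds of the HYP init code, check that the
 * idmap does not intersect the HYP VA range, allocate the HYP pgd(s) and
 * map the idmap text with PAGE_HYP_EXEC permissions.
 */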
2204 int kvm_mmu_init(void)
2205 {
2206 	int err;
2207 
2208 	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
2209 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2210 	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
2211 	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2212 	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
2213 
2214 	/*
2215 	 * We rely on the linker script to ensure at build time that the HYP
2216 	 * init code does not cross a page boundary.
2217 	 */
2218 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
2219 
2220 	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2221 	kvm_debug("HYP VA range: %lx:%lx\n",
2222 		  kern_hyp_va(PAGE_OFFSET),
2223 		  kern_hyp_va((unsigned long)high_memory - 1));
2224 
2225 	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2226 	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2227 	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2228 		/*
2229 		 * The idmap page is intersecting with the VA space,
2230 		 * it is not safe to continue further.
2231 		 */
2232 		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2233 		err = -EINVAL;
2234 		goto out;
2235 	}
2236 
2237 	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
2238 	if (!hyp_pgd) {
2239 		kvm_err("Hyp mode PGD not allocated\n");
2240 		err = -ENOMEM;
2241 		goto out;
2242 	}
2243 
2244 	if (__kvm_cpu_uses_extended_idmap()) {
2245 		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
2246 							 hyp_pgd_order);
2247 		if (!boot_hyp_pgd) {
2248 			kvm_err("Hyp boot PGD not allocated\n");
2249 			err = -ENOMEM;
2250 			goto out;
2251 		}
2252 
2253 		err = kvm_map_idmap_text(boot_hyp_pgd);
2254 		if (err)
2255 			goto out;
2256 
2257 		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
2258 		if (!merged_hyp_pgd) {
2259 			kvm_err("Failed to allocate extra HYP pgd\n");
			err = -ENOMEM;
2260 			goto out;
2261 		}
2262 		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
2263 				    hyp_idmap_start);
2264 	} else {
2265 		err = kvm_map_idmap_text(hyp_pgd);
2266 		if (err)
2267 			goto out;
2268 	}
2269 
2270 	io_map_base = hyp_idmap_start;
2271 	return 0;
2272 out:
2273 	free_hyp_pgds();
2274 	return err;
2275 }
2276 
2277 void kvm_arch_commit_memory_region(struct kvm *kvm,
2278 				   const struct kvm_userspace_memory_region *mem,
2279 				   const struct kvm_memory_slot *old,
2280 				   const struct kvm_memory_slot *new,
2281 				   enum kvm_mr_change change)
2282 {
2283 	/*
2284 	 * At this point memslot has been committed and there is an
2285 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
2286 	 * memory slot is write protected.
2287 	 */
2288 	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
2289 		kvm_mmu_wp_memory_region(kvm, mem->slot);
2290 }
2291 
2292 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2293 				   struct kvm_memory_slot *memslot,
2294 				   const struct kvm_userspace_memory_region *mem,
2295 				   enum kvm_mr_change change)
2296 {
2297 	hva_t hva = mem->userspace_addr;
2298 	hva_t reg_end = hva + mem->memory_size;
2299 	bool writable = !(mem->flags & KVM_MEM_READONLY);
2300 	int ret = 0;
2301 
2302 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2303 			change != KVM_MR_FLAGS_ONLY)
2304 		return 0;
2305 
2306 	/*
2307 	 * Prevent userspace from creating a memory region outside of the IPA
2308 	 * space addressable by the KVM guest.
2309 	 */
2310 	if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
2311 		return -EFAULT;
2312 
2313 	down_read(&current->mm->mmap_sem);
2314 	/*
2315 	 * A memory region could potentially cover multiple VMAs, and any holes
2316 	 * between them, so iterate over all of them to find out if we can map
2317 	 * any of them right now.
2318 	 *
2319 	 *     +--------------------------------------------+
2320 	 * +---------------+----------------+   +----------------+
2321 	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2322 	 * +---------------+----------------+   +----------------+
2323 	 *     |               memory region                |
2324 	 *     +--------------------------------------------+
2325 	 */
2326 	do {
2327 		struct vm_area_struct *vma = find_vma(current->mm, hva);
2328 		hva_t vm_start, vm_end;
2329 
2330 		if (!vma || vma->vm_start >= reg_end)
2331 			break;
2332 
2333 		/*
2334 		 * Mapping a read-only VMA is only allowed if the
2335 		 * memory region is configured as read-only.
2336 		 */
2337 		if (writable && !(vma->vm_flags & VM_WRITE)) {
2338 			ret = -EPERM;
2339 			break;
2340 		}
2341 
2342 		/*
2343 		 * Take the intersection of this VMA with the memory region
2344 		 */
2345 		vm_start = max(hva, vma->vm_start);
2346 		vm_end = min(reg_end, vma->vm_end);
2347 
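		/*
		 * PFNMAP VMAs describe MMIO regions: install the device
		 * mapping at stage 2 now instead of faulting it in later.
		 */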
2348 		if (vma->vm_flags & VM_PFNMAP) {
2349 			gpa_t gpa = mem->guest_phys_addr +
2350 				    (vm_start - mem->userspace_addr);
2351 			phys_addr_t pa;
2352 
2353 			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2354 			pa += vm_start - vma->vm_start;
2355 
2356 			/* IO region dirty page logging not allowed */
2357 			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2358 				ret = -EINVAL;
2359 				goto out;
2360 			}
2361 
2362 			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2363 						    vm_end - vm_start,
2364 						    writable);
2365 			if (ret)
2366 				break;
2367 		}
2368 		hva = vm_end;
2369 	} while (hva < reg_end);
2370 
2371 	if (change == KVM_MR_FLAGS_ONLY)
2372 		goto out;
2373 
2374 	spin_lock(&kvm->mmu_lock);
2375 	if (ret)
2376 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
2377 	else
2378 		stage2_flush_memslot(kvm, memslot);
2379 	spin_unlock(&kvm->mmu_lock);
2380 out:
2381 	up_read(&current->mm->mmap_sem);
2382 	return ret;
2383 }
2384 
2385 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
2386 			   struct kvm_memory_slot *dont)
2387 {
2388 }
2389 
2390 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
2391 			    unsigned long npages)
2392 {
2393 	return 0;
2394 }
2395 
2396 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2397 {
2398 }
2399 
2400 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2401 {
2402 	kvm_free_stage2_pgd(kvm);
2403 }
2404 
2405 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2406 				   struct kvm_memory_slot *slot)
2407 {
2408 	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2409 	phys_addr_t size = slot->npages << PAGE_SHIFT;
2410 
2411 	spin_lock(&kvm->mmu_lock);
2412 	unmap_stage2_range(kvm, gpa, size);
2413 	spin_unlock(&kvm->mmu_lock);
2414 }
2415 
2416 /*
2417  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2418  *
2419  * Main problems:
2420  * - S/W ops are local to a CPU (not broadcast)
2421  * - We have line migration behind our back (speculation)
2422  * - System caches don't support S/W at all (damn!)
2423  *
2424  * In the face of the above, the best we can do is to try and convert
2425  * S/W ops to VA ops. Because the guest is not allowed to infer the
2426  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2427  * which is a rather good thing for us.
2428  *
2429  * Also, it is only used when turning caches on/off ("The expected
2430  * usage of the cache maintenance instructions that operate by set/way
2431  * is associated with the cache maintenance instructions associated
2432  * with the powerdown and powerup of caches, if this is required by
2433  * the implementation.").
2434  *
2435  * We use the following policy:
2436  *
2437  * - If we trap a S/W operation, we enable VM trapping to detect
2438  *   caches being turned on/off, and do a full clean.
2439  *
2440  * - We flush the caches both when they are turned on and when they are turned off.
2441  *
2442  * - Once the caches are enabled, we stop trapping VM ops.
2443  */
2444 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2445 {
2446 	unsigned long hcr = *vcpu_hcr(vcpu);
2447 
2448 	/*
2449 	 * If this is the first time we do a S/W operation
2450 	 * (i.e. HCR_TVM not set), flush the whole memory and enable
2451 	 * VM trapping.
2452 	 *
2453 	 * Otherwise, rely on the VM trapping to wait for the MMU +
2454 	 * Caches to be turned off. At that point, we'll be able to
2455 	 * clean the caches again.
2456 	 */
2457 	if (!(hcr & HCR_TVM)) {
2458 		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2459 					vcpu_has_cache_enabled(vcpu));
2460 		stage2_flush_vm(vcpu->kvm);
2461 		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2462 	}
2463 }
2464 
2465 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2466 {
2467 	bool now_enabled = vcpu_has_cache_enabled(vcpu);
2468 
2469 	/*
2470 	 * If switching the MMU+caches on, need to invalidate the caches.
2471 	 * If switching it off, need to clean the caches.
2472 	 * Clean + invalidate does the trick always.
2473 	 */
2474 	if (now_enabled != was_enabled)
2475 		stage2_flush_vm(vcpu->kvm);
2476 
2477 	/* Caches are now on, stop trapping VM ops (until a S/W op) */
2478 	if (now_enabled)
2479 		*vcpu_hcr(vcpu) &= ~HCR_TVM;
2480 
2481 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2482 }
2483