1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5 */
6
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
9 #include <linux/io.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_mmio.h>
18 #include <asm/kvm_ras.h>
19 #include <asm/kvm_asm.h>
20 #include <asm/kvm_emulate.h>
21 #include <asm/virt.h>
22
23 #include "trace.h"
24
25 static pgd_t *boot_hyp_pgd;
26 static pgd_t *hyp_pgd;
27 static pgd_t *merged_hyp_pgd;
28 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
29
30 static unsigned long hyp_idmap_start;
31 static unsigned long hyp_idmap_end;
32 static phys_addr_t hyp_idmap_vector;
33
34 static unsigned long io_map_base;
35
36 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
37
38 #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
39 #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
40
41 static bool is_iomap(unsigned long flags)
42 {
43 return flags & KVM_S2PTE_FLAG_IS_IOMAP;
44 }
45
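/*
 * Dirty logging is considered active for a memslot when it has a dirty
 * bitmap allocated and is not marked read-only.
 */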
46 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
47 {
48 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
49 }
50
51 /**
52 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
53 * @kvm: pointer to kvm structure.
54 *
55 * Interface to HYP function to flush all VM TLB entries
56 */
57 void kvm_flush_remote_tlbs(struct kvm *kvm)
58 {
59 kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
60 }
61
62 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
63 {
64 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
65 }
66
67 /*
68 * D-Cache management functions. They take the page table entries by
69 * value, as they are flushing the cache using the kernel mapping (or
70 * kmap on 32bit).
71 */
72 static void kvm_flush_dcache_pte(pte_t pte)
73 {
74 __kvm_flush_dcache_pte(pte);
75 }
76
77 static void kvm_flush_dcache_pmd(pmd_t pmd)
78 {
79 __kvm_flush_dcache_pmd(pmd);
80 }
81
82 static void kvm_flush_dcache_pud(pud_t pud)
83 {
84 __kvm_flush_dcache_pud(pud);
85 }
86
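/*
 * A pfn without a valid struct page (!pfn_valid()) is treated as device
 * memory; such pages are mapped with device attributes and skip the
 * dcache maintenance performed for normal RAM.
 */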
87 static bool kvm_is_device_pfn(unsigned long pfn)
88 {
89 return !pfn_valid(pfn);
90 }
91
92 /**
93 * stage2_dissolve_pmd() - clear and flush huge PMD entry
94 * @kvm: pointer to kvm structure.
95 * @addr: IPA
96 * @pmd: pmd pointer for IPA
97 *
98 * Clears a PMD entry and flushes the 1st and 2nd stage TLBs for @addr.
99 */
100 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
101 {
102 if (!pmd_thp_or_huge(*pmd))
103 return;
104
105 pmd_clear(pmd);
106 kvm_tlb_flush_vmid_ipa(kvm, addr);
107 put_page(virt_to_page(pmd));
108 }
109
110 /**
111 * stage2_dissolve_pud() - clear and flush huge PUD entry
112 * @kvm: pointer to kvm structure.
113 * @addr: IPA
114 * @pud: pud pointer for IPA
115 *
116 * Clears a PUD entry and flushes the 1st and 2nd stage TLBs for @addr.
117 */
118 static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
119 {
120 if (!stage2_pud_huge(kvm, *pudp))
121 return;
122
123 stage2_pud_clear(kvm, pudp);
124 kvm_tlb_flush_vmid_ipa(kvm, addr);
125 put_page(virt_to_page(pudp));
126 }
127
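/*
 * Top up the per-vcpu cache of pre-allocated page-table pages so that
 * stage-2 table allocations done under the mmu_lock never have to sleep
 * in the page allocator.
 */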
128 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
129 int min, int max)
130 {
131 void *page;
132
133 BUG_ON(max > KVM_NR_MEM_OBJS);
134 if (cache->nobjs >= min)
135 return 0;
136 while (cache->nobjs < max) {
137 page = (void *)__get_free_page(GFP_PGTABLE_USER);
138 if (!page)
139 return -ENOMEM;
140 cache->objects[cache->nobjs++] = page;
141 }
142 return 0;
143 }
144
145 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
146 {
147 while (mc->nobjs)
148 free_page((unsigned long)mc->objects[--mc->nobjs]);
149 }
150
151 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
152 {
153 void *p;
154
155 BUG_ON(!mc || !mc->nobjs);
156 p = mc->objects[--mc->nobjs];
157 return p;
158 }
159
160 static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
161 {
162 pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
163 stage2_pgd_clear(kvm, pgd);
164 kvm_tlb_flush_vmid_ipa(kvm, addr);
165 stage2_pud_free(kvm, pud_table);
166 put_page(virt_to_page(pgd));
167 }
168
169 static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
170 {
171 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
172 VM_BUG_ON(stage2_pud_huge(kvm, *pud));
173 stage2_pud_clear(kvm, pud);
174 kvm_tlb_flush_vmid_ipa(kvm, addr);
175 stage2_pmd_free(kvm, pmd_table);
176 put_page(virt_to_page(pud));
177 }
178
179 static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
180 {
181 pte_t *pte_table = pte_offset_kernel(pmd, 0);
182 VM_BUG_ON(pmd_thp_or_huge(*pmd));
183 pmd_clear(pmd);
184 kvm_tlb_flush_vmid_ipa(kvm, addr);
185 free_page((unsigned long)pte_table);
186 put_page(virt_to_page(pmd));
187 }
188
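/*
 * Page table updates are made with WRITE_ONCE() followed by dsb(ishst),
 * so the new entry is visible to the table walker before any subsequent
 * use of the mapping.
 */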
189 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
190 {
191 WRITE_ONCE(*ptep, new_pte);
192 dsb(ishst);
193 }
194
195 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
196 {
197 WRITE_ONCE(*pmdp, new_pmd);
198 dsb(ishst);
199 }
200
201 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
202 {
203 kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
204 }
205
206 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
207 {
208 WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
209 dsb(ishst);
210 }
211
212 static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
213 {
214 WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
215 dsb(ishst);
216 }
217
218 /*
219 * Unmapping vs dcache management:
220 *
221 * If a guest maps certain memory pages as uncached, all writes will
222 * bypass the data cache and go directly to RAM. However, the CPUs
223 * can still speculate reads (not writes) and fill cache lines with
224 * data.
225 *
226 * Those cache lines will be *clean* cache lines though, so a
227 * clean+invalidate operation is equivalent to an invalidate
228 * operation, because no cache lines are marked dirty.
229 *
230 * Those clean cache lines could be filled prior to an uncached write
231 * by the guest, and the cache coherent IO subsystem would therefore
232 * end up writing old data to disk.
233 *
234 * This is why right after unmapping a page/section and invalidating
235 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
236 * the IO subsystem will never hit in the cache.
237 *
238 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
239 * we then fully enforce cacheability of RAM, no matter what the guest
240 * does.
241 */
242 static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
243 phys_addr_t addr, phys_addr_t end)
244 {
245 phys_addr_t start_addr = addr;
246 pte_t *pte, *start_pte;
247
248 start_pte = pte = pte_offset_kernel(pmd, addr);
249 do {
250 if (!pte_none(*pte)) {
251 pte_t old_pte = *pte;
252
253 kvm_set_pte(pte, __pte(0));
254 kvm_tlb_flush_vmid_ipa(kvm, addr);
255
256 /* No need to invalidate the cache for device mappings */
257 if (!kvm_is_device_pfn(pte_pfn(old_pte)))
258 kvm_flush_dcache_pte(old_pte);
259
260 put_page(virt_to_page(pte));
261 }
262 } while (pte++, addr += PAGE_SIZE, addr != end);
263
264 if (stage2_pte_table_empty(kvm, start_pte))
265 clear_stage2_pmd_entry(kvm, pmd, start_addr);
266 }
267
268 static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
269 phys_addr_t addr, phys_addr_t end)
270 {
271 phys_addr_t next, start_addr = addr;
272 pmd_t *pmd, *start_pmd;
273
274 start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
275 do {
276 next = stage2_pmd_addr_end(kvm, addr, end);
277 if (!pmd_none(*pmd)) {
278 if (pmd_thp_or_huge(*pmd)) {
279 pmd_t old_pmd = *pmd;
280
281 pmd_clear(pmd);
282 kvm_tlb_flush_vmid_ipa(kvm, addr);
283
284 kvm_flush_dcache_pmd(old_pmd);
285
286 put_page(virt_to_page(pmd));
287 } else {
288 unmap_stage2_ptes(kvm, pmd, addr, next);
289 }
290 }
291 } while (pmd++, addr = next, addr != end);
292
293 if (stage2_pmd_table_empty(kvm, start_pmd))
294 clear_stage2_pud_entry(kvm, pud, start_addr);
295 }
296
297 static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
298 phys_addr_t addr, phys_addr_t end)
299 {
300 phys_addr_t next, start_addr = addr;
301 pud_t *pud, *start_pud;
302
303 start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
304 do {
305 next = stage2_pud_addr_end(kvm, addr, end);
306 if (!stage2_pud_none(kvm, *pud)) {
307 if (stage2_pud_huge(kvm, *pud)) {
308 pud_t old_pud = *pud;
309
310 stage2_pud_clear(kvm, pud);
311 kvm_tlb_flush_vmid_ipa(kvm, addr);
312 kvm_flush_dcache_pud(old_pud);
313 put_page(virt_to_page(pud));
314 } else {
315 unmap_stage2_pmds(kvm, pud, addr, next);
316 }
317 }
318 } while (pud++, addr = next, addr != end);
319
320 if (stage2_pud_table_empty(kvm, start_pud))
321 clear_stage2_pgd_entry(kvm, pgd, start_addr);
322 }
323
324 /**
325 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
326 * @kvm: The VM pointer
327 * @start: The intermediate physical base address of the range to unmap
328 * @size: The size of the area to unmap
329 *
330 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
331 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
332 * destroying the VM), otherwise another faulting VCPU may come in and mess
333 * with things behind our backs.
334 */
335 static void __unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size,
336 bool may_block)
337 {
338 pgd_t *pgd;
339 phys_addr_t addr = start, end = start + size;
340 phys_addr_t next;
341
342 assert_spin_locked(&kvm->mmu_lock);
343 WARN_ON(size & ~PAGE_MASK);
344
345 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
346 do {
347 /*
348 * Make sure the page table is still active, as another thread
349 * could have possibly freed the page table, while we released
350 * the lock.
351 */
352 if (!READ_ONCE(kvm->arch.pgd))
353 break;
354 next = stage2_pgd_addr_end(kvm, addr, end);
355 if (!stage2_pgd_none(kvm, *pgd))
356 unmap_stage2_puds(kvm, pgd, addr, next);
357 /*
358 * If the range is too large, release the kvm->mmu_lock
359 * to prevent starvation and lockup detector warnings.
360 */
361 if (may_block && next != end)
362 cond_resched_lock(&kvm->mmu_lock);
363 } while (pgd++, addr = next, addr != end);
364 }
365
366 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
367 {
368 __unmap_stage2_range(kvm, start, size, true);
369 }
370
371 static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
372 phys_addr_t addr, phys_addr_t end)
373 {
374 pte_t *pte;
375
376 pte = pte_offset_kernel(pmd, addr);
377 do {
378 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
379 kvm_flush_dcache_pte(*pte);
380 } while (pte++, addr += PAGE_SIZE, addr != end);
381 }
382
383 static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
384 phys_addr_t addr, phys_addr_t end)
385 {
386 pmd_t *pmd;
387 phys_addr_t next;
388
389 pmd = stage2_pmd_offset(kvm, pud, addr);
390 do {
391 next = stage2_pmd_addr_end(kvm, addr, end);
392 if (!pmd_none(*pmd)) {
393 if (pmd_thp_or_huge(*pmd))
394 kvm_flush_dcache_pmd(*pmd);
395 else
396 stage2_flush_ptes(kvm, pmd, addr, next);
397 }
398 } while (pmd++, addr = next, addr != end);
399 }
400
401 static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
402 phys_addr_t addr, phys_addr_t end)
403 {
404 pud_t *pud;
405 phys_addr_t next;
406
407 pud = stage2_pud_offset(kvm, pgd, addr);
408 do {
409 next = stage2_pud_addr_end(kvm, addr, end);
410 if (!stage2_pud_none(kvm, *pud)) {
411 if (stage2_pud_huge(kvm, *pud))
412 kvm_flush_dcache_pud(*pud);
413 else
414 stage2_flush_pmds(kvm, pud, addr, next);
415 }
416 } while (pud++, addr = next, addr != end);
417 }
418
419 static void stage2_flush_memslot(struct kvm *kvm,
420 struct kvm_memory_slot *memslot)
421 {
422 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
423 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
424 phys_addr_t next;
425 pgd_t *pgd;
426
427 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
428 do {
429 next = stage2_pgd_addr_end(kvm, addr, end);
430 if (!stage2_pgd_none(kvm, *pgd))
431 stage2_flush_puds(kvm, pgd, addr, next);
432 } while (pgd++, addr = next, addr != end);
433 }
434
435 /**
436 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
437 * @kvm: The struct kvm pointer
438 *
439 * Go through the stage 2 page tables and invalidate any cache lines
440 * backing memory already mapped to the VM.
441 */
442 static void stage2_flush_vm(struct kvm *kvm)
443 {
444 struct kvm_memslots *slots;
445 struct kvm_memory_slot *memslot;
446 int idx;
447
448 idx = srcu_read_lock(&kvm->srcu);
449 spin_lock(&kvm->mmu_lock);
450
451 slots = kvm_memslots(kvm);
452 kvm_for_each_memslot(memslot, slots)
453 stage2_flush_memslot(kvm, memslot);
454
455 spin_unlock(&kvm->mmu_lock);
456 srcu_read_unlock(&kvm->srcu, idx);
457 }
458
459 static void clear_hyp_pgd_entry(pgd_t *pgd)
460 {
461 pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
462 pgd_clear(pgd);
463 pud_free(NULL, pud_table);
464 put_page(virt_to_page(pgd));
465 }
466
467 static void clear_hyp_pud_entry(pud_t *pud)
468 {
469 pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
470 VM_BUG_ON(pud_huge(*pud));
471 pud_clear(pud);
472 pmd_free(NULL, pmd_table);
473 put_page(virt_to_page(pud));
474 }
475
476 static void clear_hyp_pmd_entry(pmd_t *pmd)
477 {
478 pte_t *pte_table = pte_offset_kernel(pmd, 0);
479 VM_BUG_ON(pmd_thp_or_huge(*pmd));
480 pmd_clear(pmd);
481 pte_free_kernel(NULL, pte_table);
482 put_page(virt_to_page(pmd));
483 }
484
485 static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
486 {
487 pte_t *pte, *start_pte;
488
489 start_pte = pte = pte_offset_kernel(pmd, addr);
490 do {
491 if (!pte_none(*pte)) {
492 kvm_set_pte(pte, __pte(0));
493 put_page(virt_to_page(pte));
494 }
495 } while (pte++, addr += PAGE_SIZE, addr != end);
496
497 if (hyp_pte_table_empty(start_pte))
498 clear_hyp_pmd_entry(pmd);
499 }
500
501 static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
502 {
503 phys_addr_t next;
504 pmd_t *pmd, *start_pmd;
505
506 start_pmd = pmd = pmd_offset(pud, addr);
507 do {
508 next = pmd_addr_end(addr, end);
509 /* Hyp doesn't use huge pmds */
510 if (!pmd_none(*pmd))
511 unmap_hyp_ptes(pmd, addr, next);
512 } while (pmd++, addr = next, addr != end);
513
514 if (hyp_pmd_table_empty(start_pmd))
515 clear_hyp_pud_entry(pud);
516 }
517
518 static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
519 {
520 phys_addr_t next;
521 pud_t *pud, *start_pud;
522
523 start_pud = pud = pud_offset(pgd, addr);
524 do {
525 next = pud_addr_end(addr, end);
526 /* Hyp doesn't use huge puds */
527 if (!pud_none(*pud))
528 unmap_hyp_pmds(pud, addr, next);
529 } while (pud++, addr = next, addr != end);
530
531 if (hyp_pud_table_empty(start_pud))
532 clear_hyp_pgd_entry(pgd);
533 }
534
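/*
 * Index into a hyp PGD. The number of PGD entries is passed in because
 * the extended idmap may use a different geometry than PTRS_PER_PGD.
 */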
535 static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
536 {
537 return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
538 }
539
540 static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
541 phys_addr_t start, u64 size)
542 {
543 pgd_t *pgd;
544 phys_addr_t addr = start, end = start + size;
545 phys_addr_t next;
546
547 /*
548 * We don't unmap anything from HYP, except at the hyp tear down.
549 * Hence, we don't have to invalidate the TLBs here.
550 */
551 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
552 do {
553 next = pgd_addr_end(addr, end);
554 if (!pgd_none(*pgd))
555 unmap_hyp_puds(pgd, addr, next);
556 } while (pgd++, addr = next, addr != end);
557 }
558
559 static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
560 {
561 __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
562 }
563
564 static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
565 {
566 __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
567 }
568
569 /**
570 * free_hyp_pgds - free Hyp-mode page tables
571 *
572 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
573 * therefore contains either mappings in the kernel memory area (above
574 * PAGE_OFFSET), or device mappings in the idmap range.
575 *
576 * boot_hyp_pgd should only map the idmap range, and is only used in
577 * the extended idmap case.
578 */
579 void free_hyp_pgds(void)
580 {
581 pgd_t *id_pgd;
582
583 mutex_lock(&kvm_hyp_pgd_mutex);
584
585 id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
586
587 if (id_pgd) {
588 /* In case we never called hyp_mmu_init() */
589 if (!io_map_base)
590 io_map_base = hyp_idmap_start;
591 unmap_hyp_idmap_range(id_pgd, io_map_base,
592 hyp_idmap_start + PAGE_SIZE - io_map_base);
593 }
594
595 if (boot_hyp_pgd) {
596 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
597 boot_hyp_pgd = NULL;
598 }
599
600 if (hyp_pgd) {
601 unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
602 (uintptr_t)high_memory - PAGE_OFFSET);
603
604 free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
605 hyp_pgd = NULL;
606 }
607 if (merged_hyp_pgd) {
608 clear_page(merged_hyp_pgd);
609 free_page((unsigned long)merged_hyp_pgd);
610 merged_hyp_pgd = NULL;
611 }
612
613 mutex_unlock(&kvm_hyp_pgd_mutex);
614 }
615
616 static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
617 unsigned long end, unsigned long pfn,
618 pgprot_t prot)
619 {
620 pte_t *pte;
621 unsigned long addr;
622
623 addr = start;
624 do {
625 pte = pte_offset_kernel(pmd, addr);
626 kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
627 get_page(virt_to_page(pte));
628 pfn++;
629 } while (addr += PAGE_SIZE, addr != end);
630 }
631
632 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
633 unsigned long end, unsigned long pfn,
634 pgprot_t prot)
635 {
636 pmd_t *pmd;
637 pte_t *pte;
638 unsigned long addr, next;
639
640 addr = start;
641 do {
642 pmd = pmd_offset(pud, addr);
643
644 BUG_ON(pmd_sect(*pmd));
645
646 if (pmd_none(*pmd)) {
647 pte = pte_alloc_one_kernel(NULL);
648 if (!pte) {
649 kvm_err("Cannot allocate Hyp pte\n");
650 return -ENOMEM;
651 }
652 kvm_pmd_populate(pmd, pte);
653 get_page(virt_to_page(pmd));
654 }
655
656 next = pmd_addr_end(addr, end);
657
658 create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
659 pfn += (next - addr) >> PAGE_SHIFT;
660 } while (addr = next, addr != end);
661
662 return 0;
663 }
664
665 static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
666 unsigned long end, unsigned long pfn,
667 pgprot_t prot)
668 {
669 pud_t *pud;
670 pmd_t *pmd;
671 unsigned long addr, next;
672 int ret;
673
674 addr = start;
675 do {
676 pud = pud_offset(pgd, addr);
677
678 if (pud_none_or_clear_bad(pud)) {
679 pmd = pmd_alloc_one(NULL, addr);
680 if (!pmd) {
681 kvm_err("Cannot allocate Hyp pmd\n");
682 return -ENOMEM;
683 }
684 kvm_pud_populate(pud, pmd);
685 get_page(virt_to_page(pud));
686 }
687
688 next = pud_addr_end(addr, end);
689 ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
690 if (ret)
691 return ret;
692 pfn += (next - addr) >> PAGE_SHIFT;
693 } while (addr = next, addr != end);
694
695 return 0;
696 }
697
698 static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
699 unsigned long start, unsigned long end,
700 unsigned long pfn, pgprot_t prot)
701 {
702 pgd_t *pgd;
703 pud_t *pud;
704 unsigned long addr, next;
705 int err = 0;
706
707 mutex_lock(&kvm_hyp_pgd_mutex);
708 addr = start & PAGE_MASK;
709 end = PAGE_ALIGN(end);
710 do {
711 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
712
713 if (pgd_none(*pgd)) {
714 pud = pud_alloc_one(NULL, addr);
715 if (!pud) {
716 kvm_err("Cannot allocate Hyp pud\n");
717 err = -ENOMEM;
718 goto out;
719 }
720 kvm_pgd_populate(pgd, pud);
721 get_page(virt_to_page(pgd));
722 }
723
724 next = pgd_addr_end(addr, end);
725 err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
726 if (err)
727 goto out;
728 pfn += (next - addr) >> PAGE_SHIFT;
729 } while (addr = next, addr != end);
730 out:
731 mutex_unlock(&kvm_hyp_pgd_mutex);
732 return err;
733 }
734
735 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
736 {
737 if (!is_vmalloc_addr(kaddr)) {
738 BUG_ON(!virt_addr_valid(kaddr));
739 return __pa(kaddr);
740 } else {
741 return page_to_phys(vmalloc_to_page(kaddr)) +
742 offset_in_page(kaddr);
743 }
744 }
745
746 /**
747 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
748 * @from: The virtual kernel start address of the range
749 * @to: The virtual kernel end address of the range (exclusive)
750 * @prot: The protection to be applied to this range
751 *
752 * The same virtual address as the kernel virtual address is also used
753 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
754 * physical pages.
755 */
756 int create_hyp_mappings(void *from, void *to, pgprot_t prot)
757 {
758 phys_addr_t phys_addr;
759 unsigned long virt_addr;
760 unsigned long start = kern_hyp_va((unsigned long)from);
761 unsigned long end = kern_hyp_va((unsigned long)to);
762
763 if (is_kernel_in_hyp_mode())
764 return 0;
765
766 start = start & PAGE_MASK;
767 end = PAGE_ALIGN(end);
768
769 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
770 int err;
771
772 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
773 err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
774 virt_addr, virt_addr + PAGE_SIZE,
775 __phys_to_pfn(phys_addr),
776 prot);
777 if (err)
778 return err;
779 }
780
781 return 0;
782 }
783
784 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
785 unsigned long *haddr, pgprot_t prot)
786 {
787 pgd_t *pgd = hyp_pgd;
788 unsigned long base;
789 int ret = 0;
790
791 mutex_lock(&kvm_hyp_pgd_mutex);
792
793 /*
794 * This assumes that we have enough space below the idmap
795 * page to allocate our VAs. If not, the check below will
796 * kick in. A potential alternative would be to detect that
797 * overflow and switch to an allocation above the idmap.
798 *
799 * The allocated size is always a multiple of PAGE_SIZE.
800 */
801 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
802 base = io_map_base - size;
803
804 /*
805 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
806 * allocating the new area, as it would indicate we've
807 * overflowed the idmap/IO address range.
808 */
809 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
810 ret = -ENOMEM;
811 else
812 io_map_base = base;
813
814 mutex_unlock(&kvm_hyp_pgd_mutex);
815
816 if (ret)
817 goto out;
818
819 if (__kvm_cpu_uses_extended_idmap())
820 pgd = boot_hyp_pgd;
821
822 ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
823 base, base + size,
824 __phys_to_pfn(phys_addr), prot);
825 if (ret)
826 goto out;
827
828 *haddr = base + offset_in_page(phys_addr);
829
830 out:
831 return ret;
832 }
833
834 /**
835 * create_hyp_io_mappings - Map IO into both kernel and HYP
836 * @phys_addr: The physical start address which gets mapped
837 * @size: Size of the region being mapped
838 * @kaddr: Kernel VA for this mapping
839 * @haddr: HYP VA for this mapping
840 */
841 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
842 void __iomem **kaddr,
843 void __iomem **haddr)
844 {
845 unsigned long addr;
846 int ret;
847
848 *kaddr = ioremap(phys_addr, size);
849 if (!*kaddr)
850 return -ENOMEM;
851
852 if (is_kernel_in_hyp_mode()) {
853 *haddr = *kaddr;
854 return 0;
855 }
856
857 ret = __create_hyp_private_mapping(phys_addr, size,
858 &addr, PAGE_HYP_DEVICE);
859 if (ret) {
860 iounmap(*kaddr);
861 *kaddr = NULL;
862 *haddr = NULL;
863 return ret;
864 }
865
866 *haddr = (void __iomem *)addr;
867 return 0;
868 }
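/*
 * Example usage (sketch only; the base address and size below are
 * placeholders, not taken from any real device):
 *
 *	void __iomem *kaddr, *haddr;
 *	int ret = create_hyp_io_mappings(0x08000000UL, SZ_64K,
 *					 &kaddr, &haddr);
 *	if (ret)
 *		return ret;
 *	// kaddr is usable from the kernel, haddr from HYP context.
 */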
869
870 /**
871 * create_hyp_exec_mappings - Map an executable range into HYP
872 * @phys_addr: The physical start address which gets mapped
873 * @size: Size of the region being mapped
874 * @haddr: HYP VA for this mapping
875 */
876 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
877 void **haddr)
878 {
879 unsigned long addr;
880 int ret;
881
882 BUG_ON(is_kernel_in_hyp_mode());
883
884 ret = __create_hyp_private_mapping(phys_addr, size,
885 &addr, PAGE_HYP_EXEC);
886 if (ret) {
887 *haddr = NULL;
888 return ret;
889 }
890
891 *haddr = (void *)addr;
892 return 0;
893 }
894
895 /**
896 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
897 * @kvm: The KVM struct pointer for the VM.
898 *
899 * Allocates only the stage-2 HW PGD level table(s) of size defined by
900 * stage2_pgd_size(kvm).
901 *
902 * Note we don't need locking here as this is only called when the VM is
903 * created, which can only be done once.
904 */
905 int kvm_alloc_stage2_pgd(struct kvm *kvm)
906 {
907 phys_addr_t pgd_phys;
908 pgd_t *pgd;
909
910 if (kvm->arch.pgd != NULL) {
911 kvm_err("kvm_arch already initialized?\n");
912 return -EINVAL;
913 }
914
915 /* Allocate the HW PGD, making sure that each page gets its own refcount */
916 pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
917 if (!pgd)
918 return -ENOMEM;
919
920 pgd_phys = virt_to_phys(pgd);
921 if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
922 return -EINVAL;
923
924 kvm->arch.pgd = pgd;
925 kvm->arch.pgd_phys = pgd_phys;
926 return 0;
927 }
928
929 static void stage2_unmap_memslot(struct kvm *kvm,
930 struct kvm_memory_slot *memslot)
931 {
932 hva_t hva = memslot->userspace_addr;
933 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
934 phys_addr_t size = PAGE_SIZE * memslot->npages;
935 hva_t reg_end = hva + size;
936
937 /*
938 * A memory region could potentially cover multiple VMAs, and any holes
939 * between them, so iterate over all of them to find out if we should
940 * unmap any of them.
941 *
942 * +--------------------------------------------+
943 * +---------------+----------------+ +----------------+
944 * | : VMA 1 | VMA 2 | | VMA 3 : |
945 * +---------------+----------------+ +----------------+
946 * | memory region |
947 * +--------------------------------------------+
948 */
949 do {
950 struct vm_area_struct *vma = find_vma(current->mm, hva);
951 hva_t vm_start, vm_end;
952
953 if (!vma || vma->vm_start >= reg_end)
954 break;
955
956 /*
957 * Take the intersection of this VMA with the memory region
958 */
959 vm_start = max(hva, vma->vm_start);
960 vm_end = min(reg_end, vma->vm_end);
961
962 if (!(vma->vm_flags & VM_PFNMAP)) {
963 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
964 unmap_stage2_range(kvm, gpa, vm_end - vm_start);
965 }
966 hva = vm_end;
967 } while (hva < reg_end);
968 }
969
970 /**
971 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
972 * @kvm: The struct kvm pointer
973 *
974 * Go through the memregions and unmap any regular RAM
975 * backing memory already mapped to the VM.
976 */
977 void stage2_unmap_vm(struct kvm *kvm)
978 {
979 struct kvm_memslots *slots;
980 struct kvm_memory_slot *memslot;
981 int idx;
982
983 idx = srcu_read_lock(&kvm->srcu);
984 down_read(&current->mm->mmap_sem);
985 spin_lock(&kvm->mmu_lock);
986
987 slots = kvm_memslots(kvm);
988 kvm_for_each_memslot(memslot, slots)
989 stage2_unmap_memslot(kvm, memslot);
990
991 spin_unlock(&kvm->mmu_lock);
992 up_read(&current->mm->mmap_sem);
993 srcu_read_unlock(&kvm->srcu, idx);
994 }
995
996 /**
997 * kvm_free_stage2_pgd - free all stage-2 tables
998 * @kvm: The KVM struct pointer for the VM.
999 *
1000 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
1001 * underlying level-2 and level-3 tables before freeing the actual level-1 table
1002 * and setting the struct pointer to NULL.
1003 */
1004 void kvm_free_stage2_pgd(struct kvm *kvm)
1005 {
1006 void *pgd = NULL;
1007
1008 spin_lock(&kvm->mmu_lock);
1009 if (kvm->arch.pgd) {
1010 unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
1011 pgd = READ_ONCE(kvm->arch.pgd);
1012 kvm->arch.pgd = NULL;
1013 kvm->arch.pgd_phys = 0;
1014 }
1015 spin_unlock(&kvm->mmu_lock);
1016
1017 /* Free the HW pgd, one page at a time */
1018 if (pgd)
1019 free_pages_exact(pgd, stage2_pgd_size(kvm));
1020 }
1021
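/*
 * Walk (and, if a memory cache is supplied, populate) the stage-2 tables
 * down to the PUD level for @addr. Returns NULL when the entry is missing
 * and no cache is available to allocate the intermediate table.
 */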
1022 static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1023 phys_addr_t addr)
1024 {
1025 pgd_t *pgd;
1026 pud_t *pud;
1027
1028 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1029 if (stage2_pgd_none(kvm, *pgd)) {
1030 if (!cache)
1031 return NULL;
1032 pud = mmu_memory_cache_alloc(cache);
1033 stage2_pgd_populate(kvm, pgd, pud);
1034 get_page(virt_to_page(pgd));
1035 }
1036
1037 return stage2_pud_offset(kvm, pgd, addr);
1038 }
1039
1040 static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1041 phys_addr_t addr)
1042 {
1043 pud_t *pud;
1044 pmd_t *pmd;
1045
1046 pud = stage2_get_pud(kvm, cache, addr);
1047 if (!pud || stage2_pud_huge(kvm, *pud))
1048 return NULL;
1049
1050 if (stage2_pud_none(kvm, *pud)) {
1051 if (!cache)
1052 return NULL;
1053 pmd = mmu_memory_cache_alloc(cache);
1054 stage2_pud_populate(kvm, pud, pmd);
1055 get_page(virt_to_page(pud));
1056 }
1057
1058 return stage2_pmd_offset(kvm, pud, addr);
1059 }
1060
1061 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
1062 *cache, phys_addr_t addr, const pmd_t *new_pmd)
1063 {
1064 pmd_t *pmd, old_pmd;
1065
1066 retry:
1067 pmd = stage2_get_pmd(kvm, cache, addr);
1068 VM_BUG_ON(!pmd);
1069
1070 old_pmd = *pmd;
1071 /*
1072 * Multiple vcpus faulting on the same PMD entry can
1073 * lead to them sequentially updating the PMD with the
1074 * same value. Following the break-before-make
1075 * (pmd_clear() followed by tlb_flush()) process can
1076 * hinder forward progress due to refaults generated
1077 * on missing translations.
1078 *
1079 * Skip updating the page table if the entry is
1080 * unchanged.
1081 */
1082 if (pmd_val(old_pmd) == pmd_val(*new_pmd))
1083 return 0;
1084
1085 if (pmd_present(old_pmd)) {
1086 /*
1087 * If we already have PTE level mapping for this block,
1088 * we must unmap it to avoid inconsistent TLB state and
1089 * leaking the table page. We could end up in this situation
1090 * if the memory slot was marked for dirty logging and was
1091 * reverted, leaving PTE level mappings for the pages accessed
1092 * during the period. So, unmap the PTE level mapping for this
1093 * block and retry, as we could have released the upper level
1094 * table in the process.
1095 *
1096 * Normal THP splits and merges follow the mmu_notifier callbacks
1097 * and get handled accordingly.
1098 */
1099 if (!pmd_thp_or_huge(old_pmd)) {
1100 unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
1101 goto retry;
1102 }
1103 /*
1104 * Mapping in huge pages should only happen through a
1105 * fault. If a page is merged into a transparent huge
1106 * page, the individual subpages of that huge page
1107 * should be unmapped through MMU notifiers before we
1108 * get here.
1109 *
1110 * Merging of CompoundPages is not supported; they
1111 * should be split first, unmapped, merged,
1112 * and mapped back in on-demand.
1113 */
1114 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
1115 pmd_clear(pmd);
1116 kvm_tlb_flush_vmid_ipa(kvm, addr);
1117 } else {
1118 get_page(virt_to_page(pmd));
1119 }
1120
1121 kvm_set_pmd(pmd, *new_pmd);
1122 return 0;
1123 }
1124
1125 static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1126 phys_addr_t addr, const pud_t *new_pudp)
1127 {
1128 pud_t *pudp, old_pud;
1129
1130 retry:
1131 pudp = stage2_get_pud(kvm, cache, addr);
1132 VM_BUG_ON(!pudp);
1133
1134 old_pud = *pudp;
1135
1136 /*
1137 * A large number of vcpus faulting on the same stage 2 entry
1138 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
1139 * Skip updating the page tables if there is no change.
1140 */
1141 if (pud_val(old_pud) == pud_val(*new_pudp))
1142 return 0;
1143
1144 if (stage2_pud_present(kvm, old_pud)) {
1145 /*
1146 * If we already have table level mapping for this block, unmap
1147 * the range for this block and retry.
1148 */
1149 if (!stage2_pud_huge(kvm, old_pud)) {
1150 unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
1151 goto retry;
1152 }
1153
1154 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
1155 stage2_pud_clear(kvm, pudp);
1156 kvm_tlb_flush_vmid_ipa(kvm, addr);
1157 } else {
1158 get_page(virt_to_page(pudp));
1159 }
1160
1161 kvm_set_pud(pudp, *new_pudp);
1162 return 0;
1163 }
1164
1165 /*
1166 * stage2_get_leaf_entry - walk the stage2 VM page tables and return
1167 * true if a valid and present leaf-entry is found. A pointer to the
1168 * leaf-entry is returned in the appropriate level variable - pudpp,
1169 * pmdpp, ptepp.
1170 */
1171 static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
1172 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
1173 {
1174 pud_t *pudp;
1175 pmd_t *pmdp;
1176 pte_t *ptep;
1177
1178 *pudpp = NULL;
1179 *pmdpp = NULL;
1180 *ptepp = NULL;
1181
1182 pudp = stage2_get_pud(kvm, NULL, addr);
1183 if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1184 return false;
1185
1186 if (stage2_pud_huge(kvm, *pudp)) {
1187 *pudpp = pudp;
1188 return true;
1189 }
1190
1191 pmdp = stage2_pmd_offset(kvm, pudp, addr);
1192 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1193 return false;
1194
1195 if (pmd_thp_or_huge(*pmdp)) {
1196 *pmdpp = pmdp;
1197 return true;
1198 }
1199
1200 ptep = pte_offset_kernel(pmdp, addr);
1201 if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1202 return false;
1203
1204 *ptepp = ptep;
1205 return true;
1206 }
1207
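/*
 * Return true if @addr is backed by an executable leaf entry that covers
 * at least the mapping size @sz being established.
 */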
1208 static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr, unsigned long sz)
1209 {
1210 pud_t *pudp;
1211 pmd_t *pmdp;
1212 pte_t *ptep;
1213 bool found;
1214
1215 found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
1216 if (!found)
1217 return false;
1218
1219 if (pudp)
1220 return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1221 else if (pmdp)
1222 return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1223 else
1224 return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1225 }
1226
1227 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1228 phys_addr_t addr, const pte_t *new_pte,
1229 unsigned long flags)
1230 {
1231 pud_t *pud;
1232 pmd_t *pmd;
1233 pte_t *pte, old_pte;
1234 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1235 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1236
1237 VM_BUG_ON(logging_active && !cache);
1238
1239 /* Create stage-2 page table mapping - Levels 0 and 1 */
1240 pud = stage2_get_pud(kvm, cache, addr);
1241 if (!pud) {
1242 /*
1243 * Ignore calls from kvm_set_spte_hva for unallocated
1244 * address ranges.
1245 */
1246 return 0;
1247 }
1248
1249 /*
1250 * While dirty page logging - dissolve huge PUD, then continue
1251 * on to allocate page.
1252 */
1253 if (logging_active)
1254 stage2_dissolve_pud(kvm, addr, pud);
1255
1256 if (stage2_pud_none(kvm, *pud)) {
1257 if (!cache)
1258 return 0; /* ignore calls from kvm_set_spte_hva */
1259 pmd = mmu_memory_cache_alloc(cache);
1260 stage2_pud_populate(kvm, pud, pmd);
1261 get_page(virt_to_page(pud));
1262 }
1263
1264 pmd = stage2_pmd_offset(kvm, pud, addr);
1265 if (!pmd) {
1266 /*
1267 * Ignore calls from kvm_set_spte_hva for unallocated
1268 * address ranges.
1269 */
1270 return 0;
1271 }
1272
1273 /*
1274 * While dirty page logging - dissolve huge PMD, then continue on to
1275 * allocate page.
1276 */
1277 if (logging_active)
1278 stage2_dissolve_pmd(kvm, addr, pmd);
1279
1280 /* Create stage-2 page mappings - Level 2 */
1281 if (pmd_none(*pmd)) {
1282 if (!cache)
1283 return 0; /* ignore calls from kvm_set_spte_hva */
1284 pte = mmu_memory_cache_alloc(cache);
1285 kvm_pmd_populate(pmd, pte);
1286 get_page(virt_to_page(pmd));
1287 }
1288
1289 pte = pte_offset_kernel(pmd, addr);
1290
1291 if (iomap && pte_present(*pte))
1292 return -EFAULT;
1293
1294 /* Create 2nd stage page table mapping - Level 3 */
1295 old_pte = *pte;
1296 if (pte_present(old_pte)) {
1297 /* Skip page table update if there is no change */
1298 if (pte_val(old_pte) == pte_val(*new_pte))
1299 return 0;
1300
1301 kvm_set_pte(pte, __pte(0));
1302 kvm_tlb_flush_vmid_ipa(kvm, addr);
1303 } else {
1304 get_page(virt_to_page(pte));
1305 }
1306
1307 kvm_set_pte(pte, *new_pte);
1308 return 0;
1309 }
1310
1311 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1312 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1313 {
1314 if (pte_young(*pte)) {
1315 *pte = pte_mkold(*pte);
1316 return 1;
1317 }
1318 return 0;
1319 }
1320 #else
1321 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1322 {
1323 return __ptep_test_and_clear_young(pte);
1324 }
1325 #endif
1326
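/*
 * PMD and PUD entries share the PTE layout as far as the access flag is
 * concerned, so the pte helper above is reused for the higher levels.
 */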
1327 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1328 {
1329 return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1330 }
1331
1332 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1333 {
1334 return stage2_ptep_test_and_clear_young((pte_t *)pud);
1335 }
1336
1337 /**
1338 * kvm_phys_addr_ioremap - map a device range to guest IPA
1339 *
1340 * @kvm: The KVM pointer
1341 * @guest_ipa: The IPA at which to insert the mapping
1342 * @pa: The physical address of the device
1343 * @size: The size of the mapping
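* @writable: Whether or not to create a writable mapping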
1344 */
1345 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1346 phys_addr_t pa, unsigned long size, bool writable)
1347 {
1348 phys_addr_t addr, end;
1349 int ret = 0;
1350 unsigned long pfn;
1351 struct kvm_mmu_memory_cache cache = { 0, };
1352
1353 end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1354 pfn = __phys_to_pfn(pa);
1355
1356 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1357 pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
1358
1359 if (writable)
1360 pte = kvm_s2pte_mkwrite(pte);
1361
1362 ret = mmu_topup_memory_cache(&cache,
1363 kvm_mmu_cache_min_pages(kvm),
1364 KVM_NR_MEM_OBJS);
1365 if (ret)
1366 goto out;
1367 spin_lock(&kvm->mmu_lock);
1368 ret = stage2_set_pte(kvm, &cache, addr, &pte,
1369 KVM_S2PTE_FLAG_IS_IOMAP);
1370 spin_unlock(&kvm->mmu_lock);
1371 if (ret)
1372 goto out;
1373
1374 pfn++;
1375 }
1376
1377 out:
1378 mmu_free_memory_cache(&cache);
1379 return ret;
1380 }
1381
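/*
 * If the faulting pfn is part of a transparent huge page, transfer the
 * refcount and adjust *pfnp/*ipap to the head of the PMD-sized block.
 * Returns true when the fault can be satisfied with a PMD block mapping.
 */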
1382 static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
1383 {
1384 kvm_pfn_t pfn = *pfnp;
1385 gfn_t gfn = *ipap >> PAGE_SHIFT;
1386 struct page *page = pfn_to_page(pfn);
1387
1388 /*
1389 * PageTransCompoundMap() returns true for THP and
1390 * hugetlbfs. Make sure the adjustment is done only for THP
1391 * pages.
1392 */
1393 if (!PageHuge(page) && PageTransCompoundMap(page)) {
1394 unsigned long mask;
1395 /*
1396 * The address we faulted on is backed by a transparent huge
1397 * page. However, because we map the compound huge page and
1398 * not the individual tail page, we need to transfer the
1399 * refcount to the head page. We have to be careful that the
1400 * THP doesn't start to split while we are adjusting the
1401 * refcounts.
1402 *
1403 * We are sure this doesn't happen, because mmu_notifier_retry
1404 * was successful and we are holding the mmu_lock, so if this
1405 * THP is trying to split, it will be blocked in the mmu
1406 * notifier before touching any of the pages, specifically
1407 * before being able to call __split_huge_page_refcount().
1408 *
1409 * We can therefore safely transfer the refcount from PG_tail
1410 * to PG_head and switch the pfn from a tail page to the head
1411 * page accordingly.
1412 */
1413 mask = PTRS_PER_PMD - 1;
1414 VM_BUG_ON((gfn & mask) != (pfn & mask));
1415 if (pfn & mask) {
1416 *ipap &= PMD_MASK;
1417 kvm_release_pfn_clean(pfn);
1418 pfn &= ~mask;
1419 kvm_get_pfn(pfn);
1420 *pfnp = pfn;
1421 }
1422
1423 return true;
1424 }
1425
1426 return false;
1427 }
1428
1429 /**
1430 * stage2_wp_ptes - write protect PMD range
1431 * @pmd: pointer to pmd entry
1432 * @addr: range start address
1433 * @end: range end address
1434 */
1435 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1436 {
1437 pte_t *pte;
1438
1439 pte = pte_offset_kernel(pmd, addr);
1440 do {
1441 if (!pte_none(*pte)) {
1442 if (!kvm_s2pte_readonly(pte))
1443 kvm_set_s2pte_readonly(pte);
1444 }
1445 } while (pte++, addr += PAGE_SIZE, addr != end);
1446 }
1447
1448 /**
1449 * stage2_wp_pmds - write protect PUD range
1450 * @kvm: kvm instance for the VM
1451 * @pud: pointer to pud entry
1452 * @addr: range start address
1453 * @end: range end address
1454 */
1455 static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
1456 phys_addr_t addr, phys_addr_t end)
1457 {
1458 pmd_t *pmd;
1459 phys_addr_t next;
1460
1461 pmd = stage2_pmd_offset(kvm, pud, addr);
1462
1463 do {
1464 next = stage2_pmd_addr_end(kvm, addr, end);
1465 if (!pmd_none(*pmd)) {
1466 if (pmd_thp_or_huge(*pmd)) {
1467 if (!kvm_s2pmd_readonly(pmd))
1468 kvm_set_s2pmd_readonly(pmd);
1469 } else {
1470 stage2_wp_ptes(pmd, addr, next);
1471 }
1472 }
1473 } while (pmd++, addr = next, addr != end);
1474 }
1475
1476 /**
1477 * stage2_wp_puds - write protect PGD range
1478 * @pgd: pointer to pgd entry
1479 * @addr: range start address
1480 * @end: range end address
1481 */
1482 static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
1483 phys_addr_t addr, phys_addr_t end)
1484 {
1485 pud_t *pud;
1486 phys_addr_t next;
1487
1488 pud = stage2_pud_offset(kvm, pgd, addr);
1489 do {
1490 next = stage2_pud_addr_end(kvm, addr, end);
1491 if (!stage2_pud_none(kvm, *pud)) {
1492 if (stage2_pud_huge(kvm, *pud)) {
1493 if (!kvm_s2pud_readonly(pud))
1494 kvm_set_s2pud_readonly(pud);
1495 } else {
1496 stage2_wp_pmds(kvm, pud, addr, next);
1497 }
1498 }
1499 } while (pud++, addr = next, addr != end);
1500 }
1501
1502 /**
1503 * stage2_wp_range() - write protect stage2 memory region range
1504 * @kvm: The KVM pointer
1505 * @addr: Start address of range
1506 * @end: End address of range
1507 */
1508 static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1509 {
1510 pgd_t *pgd;
1511 phys_addr_t next;
1512
1513 pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1514 do {
1515 /*
1516 * Release kvm_mmu_lock periodically if the memory region is
1517 * large. Otherwise, we may see kernel panics with
1518 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1519 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1520 * will also starve other vCPUs. We have to also make sure
1521 * that the page tables are not freed while we released
1522 * the lock.
1523 */
1524 cond_resched_lock(&kvm->mmu_lock);
1525 if (!READ_ONCE(kvm->arch.pgd))
1526 break;
1527 next = stage2_pgd_addr_end(kvm, addr, end);
1528 if (stage2_pgd_present(kvm, *pgd))
1529 stage2_wp_puds(kvm, pgd, addr, next);
1530 } while (pgd++, addr = next, addr != end);
1531 }
1532
1533 /**
1534 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1535 * @kvm: The KVM pointer
1536 * @slot: The memory slot to write protect
1537 *
1538 * Called to start logging dirty pages after memory region
1539 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1540 * all present PUD, PMD and PTEs are write protected in the memory region.
1541 * Afterwards read of dirty page log can be called.
1542 *
1543 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1544 * serializing operations for VM memory regions.
1545 */
1546 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1547 {
1548 struct kvm_memslots *slots = kvm_memslots(kvm);
1549 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1550 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1551 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1552
1553 spin_lock(&kvm->mmu_lock);
1554 stage2_wp_range(kvm, start, end);
1555 spin_unlock(&kvm->mmu_lock);
1556 kvm_flush_remote_tlbs(kvm);
1557 }
1558
1559 /**
1560 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1561 * @kvm: The KVM pointer
1562 * @slot: The memory slot associated with mask
1563 * @gfn_offset: The gfn offset in memory slot
1564 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
1565 * slot to be write protected
1566 *
1567 * Walks the bits set in @mask and write protects the associated PTEs. Caller must
1568 * acquire kvm_mmu_lock.
1569 */
1570 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1571 struct kvm_memory_slot *slot,
1572 gfn_t gfn_offset, unsigned long mask)
1573 {
1574 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1575 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
1576 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1577
1578 stage2_wp_range(kvm, start, end);
1579 }
1580
1581 /*
1582 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1583 * dirty pages.
1584 *
1585 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1586 * enable dirty logging for them.
1587 */
1588 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1589 struct kvm_memory_slot *slot,
1590 gfn_t gfn_offset, unsigned long mask)
1591 {
1592 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1593 }
1594
1595 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1596 {
1597 __clean_dcache_guest_page(pfn, size);
1598 }
1599
1600 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1601 {
1602 __invalidate_icache_guest_page(pfn, size);
1603 }
1604
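/*
 * Deliver a BUS_MCEERR_AR signal for a hardware-poisoned page; the
 * reported granule is the hugepage size for hugetlb VMAs, otherwise a
 * single page.
 */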
1605 static void kvm_send_hwpoison_signal(unsigned long address,
1606 struct vm_area_struct *vma)
1607 {
1608 short lsb;
1609
1610 if (is_vm_hugetlb_page(vma))
1611 lsb = huge_page_shift(hstate_vma(vma));
1612 else
1613 lsb = PAGE_SHIFT;
1614
1615 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1616 }
1617
1618 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1619 unsigned long hva,
1620 unsigned long map_size)
1621 {
1622 gpa_t gpa_start;
1623 hva_t uaddr_start, uaddr_end;
1624 size_t size;
1625
1626 size = memslot->npages * PAGE_SIZE;
1627
1628 gpa_start = memslot->base_gfn << PAGE_SHIFT;
1629
1630 uaddr_start = memslot->userspace_addr;
1631 uaddr_end = uaddr_start + size;
1632
1633 /*
1634 * Pages belonging to memslots that don't have the same alignment
1635 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1636 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1637 *
1638 * Consider a layout like the following:
1639 *
1640 * memslot->userspace_addr:
1641 * +-----+--------------------+--------------------+---+
1642 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
1643 * +-----+--------------------+--------------------+---+
1644 *
1645 * memslot->base_gfn << PAGE_SHIFT:
1646 * +---+--------------------+--------------------+-----+
1647 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
1648 * +---+--------------------+--------------------+-----+
1649 *
1650 * If we create those stage-2 blocks, we'll end up with this incorrect
1651 * mapping:
1652 * d -> f
1653 * e -> g
1654 * f -> h
1655 */
1656 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1657 return false;
1658
1659 /*
1660 * Next, let's make sure we're not trying to map anything not covered
1661 * by the memslot. This means we have to prohibit block size mappings
1662 * for the beginning and end of a non-block aligned and non-block sized
1663 * memory slot (illustrated by the head and tail parts of the
1664 * userspace view above containing pages 'abcde' and 'xyz',
1665 * respectively).
1666 *
1667 * Note that it doesn't matter if we do the check using the
1668 * userspace_addr or the base_gfn, as both are equally aligned (per
1669 * the check above) and equally sized.
1670 */
1671 return (hva & ~(map_size - 1)) >= uaddr_start &&
1672 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1673 }
1674
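/*
 * Resolve a stage-2 translation or permission fault on memslot-backed
 * memory: pin the host page, pick the largest mapping granule the host
 * VMA and the memslot layout allow (PTE, PMD or PUD), and install the
 * mapping under the mmu_lock.
 */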
1675 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1676 struct kvm_memory_slot *memslot, unsigned long hva,
1677 unsigned long fault_status)
1678 {
1679 int ret;
1680 bool write_fault, writable, force_pte = false;
1681 bool exec_fault, needs_exec;
1682 unsigned long mmu_seq;
1683 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1684 struct kvm *kvm = vcpu->kvm;
1685 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1686 struct vm_area_struct *vma;
1687 kvm_pfn_t pfn;
1688 pgprot_t mem_type = PAGE_S2;
1689 bool logging_active = memslot_is_logging(memslot);
1690 unsigned long vma_pagesize, flags = 0;
1691
1692 write_fault = kvm_is_write_fault(vcpu);
1693 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
1694 VM_BUG_ON(write_fault && exec_fault);
1695
1696 if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1697 kvm_err("Unexpected L2 read permission error\n");
1698 return -EFAULT;
1699 }
1700
1701 /* Let's check if we will get back a huge page backed by hugetlbfs */
1702 down_read(&current->mm->mmap_sem);
1703 vma = find_vma_intersection(current->mm, hva, hva + 1);
1704 if (unlikely(!vma)) {
1705 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1706 up_read(&current->mm->mmap_sem);
1707 return -EFAULT;
1708 }
1709
1710 vma_pagesize = vma_kernel_pagesize(vma);
1711 if (logging_active ||
1712 (vma->vm_flags & VM_PFNMAP) ||
1713 !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1714 force_pte = true;
1715 vma_pagesize = PAGE_SIZE;
1716 }
1717
1718 /*
1719 * The stage2 has a minimum of 2 level table (For arm64 see
1720 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1721 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
1722 * As for PUD huge maps, we must make sure that we have at least
1723 * 3 levels, i.e, PMD is not folded.
1724 */
1725 if (vma_pagesize == PMD_SIZE ||
1726 (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1727 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1728 up_read(&current->mm->mmap_sem);
1729
1730 /* We need minimum second+third level pages */
1731 ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
1732 KVM_NR_MEM_OBJS);
1733 if (ret)
1734 return ret;
1735
1736 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1737 /*
1738 * Ensure the read of mmu_notifier_seq happens before we call
1739 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1740 * the page we just got a reference to gets unmapped before we have a
1741 * chance to grab the mmu_lock, which ensures that if the page gets
1742 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1743 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1744 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1745 */
1746 smp_rmb();
1747
1748 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1749 if (pfn == KVM_PFN_ERR_HWPOISON) {
1750 kvm_send_hwpoison_signal(hva, vma);
1751 return 0;
1752 }
1753 if (is_error_noslot_pfn(pfn))
1754 return -EFAULT;
1755
1756 if (kvm_is_device_pfn(pfn)) {
1757 mem_type = PAGE_S2_DEVICE;
1758 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1759 force_pte = true;
1760 } else if (logging_active) {
1761 /*
1762 * Faults on pages in a memslot with logging enabled
1763 * should not be mapped with huge pages (it introduces churn
1764 * and performance degradation), so force a pte mapping.
1765 */
1766 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1767
1768 /*
1769 * Only actually map the page as writable if this was a write
1770 * fault.
1771 */
1772 if (!write_fault)
1773 writable = false;
1774 }
1775
1776 if (exec_fault && is_iomap(flags))
1777 return -ENOEXEC;
1778
1779 spin_lock(&kvm->mmu_lock);
1780 if (mmu_notifier_retry(kvm, mmu_seq))
1781 goto out_unlock;
1782
1783 if (vma_pagesize == PAGE_SIZE && !force_pte) {
1784 /*
1785 * Only PMD_SIZE transparent hugepages(THP) are
1786 * currently supported. This code will need to be
1787 * updated to support other THP sizes.
1788 *
1789 * Make sure the host VA and the guest IPA are sufficiently
1790 * aligned and that the block is contained within the memslot.
1791 */
1792 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
1793 transparent_hugepage_adjust(&pfn, &fault_ipa))
1794 vma_pagesize = PMD_SIZE;
1795 }
1796
1797 if (writable)
1798 kvm_set_pfn_dirty(pfn);
1799
1800 if (fault_status != FSC_PERM && !is_iomap(flags))
1801 clean_dcache_guest_page(pfn, vma_pagesize);
1802
1803 if (exec_fault)
1804 invalidate_icache_guest_page(pfn, vma_pagesize);
1805
1806 /*
1807 * If we took an execution fault we have made the
1808 * icache/dcache coherent above and should now let the s2
1809 * mapping be executable.
1810 *
1811 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1812 * execute permissions, and we preserve whatever we have.
1813 */
1814 needs_exec = exec_fault ||
1815 (fault_status == FSC_PERM &&
1816 stage2_is_exec(kvm, fault_ipa, vma_pagesize));
1817
1818 /*
1819 * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and
1820 * all we have is a 2-level page table. Trying to map a PUD in
1821 * this case would be fatally wrong.
1822 */
1823 if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) {
1824 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1825
1826 new_pud = kvm_pud_mkhuge(new_pud);
1827 if (writable)
1828 new_pud = kvm_s2pud_mkwrite(new_pud);
1829
1830 if (needs_exec)
1831 new_pud = kvm_s2pud_mkexec(new_pud);
1832
1833 ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
1834 } else if (vma_pagesize == PMD_SIZE) {
1835 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1836
1837 new_pmd = kvm_pmd_mkhuge(new_pmd);
1838
1839 if (writable)
1840 new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1841
1842 if (needs_exec)
1843 new_pmd = kvm_s2pmd_mkexec(new_pmd);
1844
1845 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1846 } else {
1847 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1848
1849 if (writable) {
1850 new_pte = kvm_s2pte_mkwrite(new_pte);
1851 mark_page_dirty(kvm, gfn);
1852 }
1853
1854 if (needs_exec)
1855 new_pte = kvm_s2pte_mkexec(new_pte);
1856
1857 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1858 }
1859
1860 out_unlock:
1861 spin_unlock(&kvm->mmu_lock);
1862 kvm_set_pfn_accessed(pfn);
1863 kvm_release_pfn_clean(pfn);
1864 return ret;
1865 }
1866
1867 /*
1868 * Resolve the access fault by making the page young again.
1869 * Note that because the faulting entry is guaranteed not to be
1870 * cached in the TLB, we don't need to invalidate anything.
1871 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1872 * so there is no need for atomic (pte|pmd)_mkyoung operations.
1873 */
1874 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1875 {
1876 pud_t *pud;
1877 pmd_t *pmd;
1878 pte_t *pte;
1879 kvm_pfn_t pfn;
1880 bool pfn_valid = false;
1881
1882 trace_kvm_access_fault(fault_ipa);
1883
1884 spin_lock(&vcpu->kvm->mmu_lock);
1885
1886 if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
1887 goto out;
1888
1889 if (pud) { /* HugeTLB */
1890 *pud = kvm_s2pud_mkyoung(*pud);
1891 pfn = kvm_pud_pfn(*pud);
1892 pfn_valid = true;
1893 } else if (pmd) { /* THP, HugeTLB */
1894 *pmd = pmd_mkyoung(*pmd);
1895 pfn = pmd_pfn(*pmd);
1896 pfn_valid = true;
1897 } else {
1898 *pte = pte_mkyoung(*pte); /* Just a page... */
1899 pfn = pte_pfn(*pte);
1900 pfn_valid = true;
1901 }
1902
1903 out:
1904 spin_unlock(&vcpu->kvm->mmu_lock);
1905 if (pfn_valid)
1906 kvm_set_pfn_accessed(pfn);
1907 }
1908
1909 /**
1910 * kvm_handle_guest_abort - handles all 2nd stage aborts
1911 * @vcpu: the VCPU pointer
1912 * @run: the kvm_run structure
1913 *
1914 * Any abort that gets to the host is almost guaranteed to be caused by a
1915  * missing second stage translation table entry, which can mean either that the
1916  * guest simply needs more memory and we must allocate an appropriate page, or
1917  * that the guest tried to access I/O memory, which is emulated by user
1918 * space. The distinction is based on the IPA causing the fault and whether this
1919 * memory region has been registered as standard RAM by user space.
1920 */
1921 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1922 {
1923 unsigned long fault_status;
1924 phys_addr_t fault_ipa;
1925 struct kvm_memory_slot *memslot;
1926 unsigned long hva;
1927 bool is_iabt, write_fault, writable;
1928 gfn_t gfn;
1929 int ret, idx;
1930
1931 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1932
1933 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1934 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1935
1936 /* Synchronous External Abort? */
1937 if (kvm_vcpu_dabt_isextabt(vcpu)) {
1938 /*
1939 * For RAS the host kernel may handle this abort.
1940 * There is no need to pass the error into the guest.
1941 */
1942 if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
1943 return 1;
1944
1945 if (unlikely(!is_iabt)) {
1946 kvm_inject_vabt(vcpu);
1947 return 1;
1948 }
1949 }
1950
1951 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1952 kvm_vcpu_get_hfar(vcpu), fault_ipa);
1953
1954 	/* Check that the stage-2 fault is a translation, permission or access fault */
1955 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1956 fault_status != FSC_ACCESS) {
1957 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1958 kvm_vcpu_trap_get_class(vcpu),
1959 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1960 (unsigned long)kvm_vcpu_get_hsr(vcpu));
1961 return -EFAULT;
1962 }
1963
1964 idx = srcu_read_lock(&vcpu->kvm->srcu);
1965
1966 gfn = fault_ipa >> PAGE_SHIFT;
1967 memslot = gfn_to_memslot(vcpu->kvm, gfn);
1968 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1969 write_fault = kvm_is_write_fault(vcpu);
1970 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1971 if (is_iabt) {
1972 /* Prefetch Abort on I/O address */
1973 ret = -ENOEXEC;
1974 goto out;
1975 }
1976
1977 /*
1978 * Check for a cache maintenance operation. Since we
1979 		 * ended up here, we know it is outside of any memory
1980 * slot. But we can't find out if that is for a device,
1981 * or if the guest is just being stupid. The only thing
1982 * we know for sure is that this range cannot be cached.
1983 *
1984 * So let's assume that the guest is just being
1985 * cautious, and skip the instruction.
1986 */
1987 if (kvm_vcpu_dabt_is_cm(vcpu)) {
1988 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1989 ret = 1;
1990 goto out_unlock;
1991 }
1992
1993 /*
1994 * The IPA is reported as [MAX:12], so we need to
1995 * complement it with the bottom 12 bits from the
1996 * faulting VA. This is always 12 bits, irrespective
1997 * of the page size.
1998 */
1999 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
2000 ret = io_mem_abort(vcpu, run, fault_ipa);
2001 goto out_unlock;
2002 }
2003
2004 /* Userspace should not be able to register out-of-bounds IPAs */
2005 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
2006
2007 if (fault_status == FSC_ACCESS) {
2008 handle_access_fault(vcpu, fault_ipa);
2009 ret = 1;
2010 goto out_unlock;
2011 }
2012
2013 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
2014 if (ret == 0)
2015 ret = 1;
2016 out:
2017 if (ret == -ENOEXEC) {
2018 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2019 ret = 1;
2020 }
2021 out_unlock:
2022 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2023 return ret;
2024 }
2025
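/*
 * handle_hva_to_gpa() - apply a handler over the guest physical ranges
 * backing a range of host virtual addresses.
 *
 * Walks every memslot, intersects it with [start, end) and invokes
 * @handler on the corresponding IPA range. The handler results are
 * OR'ed together, so any non-zero result propagates to the caller.
 */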
2026 static int handle_hva_to_gpa(struct kvm *kvm,
2027 unsigned long start,
2028 unsigned long end,
2029 int (*handler)(struct kvm *kvm,
2030 gpa_t gpa, u64 size,
2031 void *data),
2032 void *data)
2033 {
2034 struct kvm_memslots *slots;
2035 struct kvm_memory_slot *memslot;
2036 int ret = 0;
2037
2038 slots = kvm_memslots(kvm);
2039
2040 /* we only care about the pages that the guest sees */
2041 kvm_for_each_memslot(memslot, slots) {
2042 unsigned long hva_start, hva_end;
2043 gfn_t gpa;
2044
2045 hva_start = max(start, memslot->userspace_addr);
2046 hva_end = min(end, memslot->userspace_addr +
2047 (memslot->npages << PAGE_SHIFT));
2048 if (hva_start >= hva_end)
2049 continue;
2050
2051 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
2052 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
2053 }
2054
2055 return ret;
2056 }
2057
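/*
 * MMU notifier unmap helper: tear down the stage-2 mappings for the
 * given IPA range. @data carries the notifier range flags, which tell
 * us whether the unmap is allowed to block.
 */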
2058 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2059 {
2060 unsigned flags = *(unsigned *)data;
2061 bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
2062
2063 __unmap_stage2_range(kvm, gpa, size, may_block);
2064 return 0;
2065 }
2066
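/*
 * MMU notifier hook: the host is unmapping [start, end) from the process
 * address space, so drop the corresponding stage-2 mappings. Nothing to
 * do if the stage-2 page tables have already been freed.
 */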
2067 int kvm_unmap_hva_range(struct kvm *kvm,
2068 unsigned long start, unsigned long end, unsigned flags)
2069 {
2070 if (!kvm->arch.pgd)
2071 return 0;
2072
2073 trace_kvm_unmap_hva_range(start, end);
2074 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
2075 return 0;
2076 }
2077
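/*
 * Install a new stage-2 PTE for a single page; used by the change_pte
 * MMU notifier path (kvm_set_spte_hva) below.
 */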
2078 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2079 {
2080 pte_t *pte = (pte_t *)data;
2081
2082 WARN_ON(size != PAGE_SIZE);
2083 /*
2084 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
2085 * flag clear because MMU notifiers will have unmapped a huge PMD before
2086 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
2087 * therefore stage2_set_pte() never needs to clear out a huge PMD
2088 * through this calling path.
2089 */
2090 stage2_set_pte(kvm, NULL, gpa, pte, 0);
2091 return 0;
2092 }
2093
2094
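/*
 * MMU notifier change_pte hook: the host has changed the page backing
 * @hva (typically after CoW), so point the stage-2 mapping at the new
 * page.
 */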
2095 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2096 {
2097 unsigned long end = hva + PAGE_SIZE;
2098 kvm_pfn_t pfn = pte_pfn(pte);
2099 pte_t stage2_pte;
2100
2101 if (!kvm->arch.pgd)
2102 return 0;
2103
2104 trace_kvm_set_spte_hva(hva);
2105
2106 /*
2107 * We've moved a page around, probably through CoW, so let's treat it
2108 * just like a translation fault and clean the cache to the PoC.
2109 */
2110 clean_dcache_guest_page(pfn, PAGE_SIZE);
2111 stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
2112 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
2113
2114 return 0;
2115 }
2116
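/*
 * Test and clear the access flag on the leaf entry (PUD, PMD or PTE)
 * mapping @gpa. Returns non-zero if the entry was young.
 */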
2117 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2118 {
2119 pud_t *pud;
2120 pmd_t *pmd;
2121 pte_t *pte;
2122
2123 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2124 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2125 return 0;
2126
2127 if (pud)
2128 return stage2_pudp_test_and_clear_young(pud);
2129 else if (pmd)
2130 return stage2_pmdp_test_and_clear_young(pmd);
2131 else
2132 return stage2_ptep_test_and_clear_young(pte);
2133 }
2134
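/*
 * Test, without clearing, the access flag on the leaf entry mapping
 * @gpa. Returns non-zero if the entry is young.
 */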
2135 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2136 {
2137 pud_t *pud;
2138 pmd_t *pmd;
2139 pte_t *pte;
2140
2141 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2142 if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2143 return 0;
2144
2145 if (pud)
2146 return kvm_s2pud_young(*pud);
2147 else if (pmd)
2148 return pmd_young(*pmd);
2149 else
2150 return pte_young(*pte);
2151 }
2152
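/* MMU notifier ageing hook: clear the stage-2 access flags for [start, end). */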
2153 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2154 {
2155 if (!kvm->arch.pgd)
2156 return 0;
2157 trace_kvm_age_hva(start, end);
2158 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2159 }
2160
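/* MMU notifier hook: report whether the page backing @hva is young. */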
2161 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2162 {
2163 if (!kvm->arch.pgd)
2164 return 0;
2165 trace_kvm_test_age_hva(hva);
2166 return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2167 kvm_test_age_hva_handler, NULL);
2168 }
2169
2170 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
2171 {
2172 mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
2173 }
2174
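/*
 * Return the physical address of the HYP translation table base. With an
 * extended idmap this is the merged PGD covering both the idmap and the
 * regular HYP mappings.
 */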
2175 phys_addr_t kvm_mmu_get_httbr(void)
2176 {
2177 if (__kvm_cpu_uses_extended_idmap())
2178 return virt_to_phys(merged_hyp_pgd);
2179 else
2180 return virt_to_phys(hyp_pgd);
2181 }
2182
2183 phys_addr_t kvm_get_idmap_vector(void)
2184 {
2185 return hyp_idmap_vector;
2186 }
2187
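/*
 * Identity-map the HYP init text into @pgd, executable at HYP, so that
 * the init code keeps running while the EL2/HYP MMU is being enabled.
 */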
2188 static int kvm_map_idmap_text(pgd_t *pgd)
2189 {
2190 int err;
2191
2192 /* Create the idmap in the boot page tables */
2193 err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
2194 hyp_idmap_start, hyp_idmap_end,
2195 __phys_to_pfn(hyp_idmap_start),
2196 PAGE_HYP_EXEC);
2197 if (err)
2198 kvm_err("Failed to idmap %lx-%lx\n",
2199 hyp_idmap_start, hyp_idmap_end);
2200
2201 return err;
2202 }
2203
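/*
 * kvm_mmu_init() - set up the HYP page tables
 *
 * Computes the HYP idmap boundaries, allocates the HYP (and, when an
 * extended idmap is needed, boot and merged) PGDs, maps the idmap text
 * and initialises the base of the HYP private I/O mapping range.
 */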
2204 int kvm_mmu_init(void)
2205 {
2206 int err;
2207
2208 hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
2209 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2210 hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
2211 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2212 hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
2213
2214 /*
2215 * We rely on the linker script to ensure at build time that the HYP
2216 * init code does not cross a page boundary.
2217 */
2218 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
2219
2220 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2221 kvm_debug("HYP VA range: %lx:%lx\n",
2222 kern_hyp_va(PAGE_OFFSET),
2223 kern_hyp_va((unsigned long)high_memory - 1));
2224
2225 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2226 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
2227 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2228 /*
2229 		 * The idmap page intersects with the HYP VA space;
2230 		 * it is not safe to continue further.
2231 */
2232 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2233 err = -EINVAL;
2234 goto out;
2235 }
2236
2237 hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
2238 if (!hyp_pgd) {
2239 kvm_err("Hyp mode PGD not allocated\n");
2240 err = -ENOMEM;
2241 goto out;
2242 }
2243
2244 if (__kvm_cpu_uses_extended_idmap()) {
2245 boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
2246 hyp_pgd_order);
2247 if (!boot_hyp_pgd) {
2248 kvm_err("Hyp boot PGD not allocated\n");
2249 err = -ENOMEM;
2250 goto out;
2251 }
2252
2253 err = kvm_map_idmap_text(boot_hyp_pgd);
2254 if (err)
2255 goto out;
2256
2257 merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
2258 if (!merged_hyp_pgd) {
2259 kvm_err("Failed to allocate extra HYP pgd\n");
2260 goto out;
2261 }
2262 __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
2263 hyp_idmap_start);
2264 } else {
2265 err = kvm_map_idmap_text(hyp_pgd);
2266 if (err)
2267 goto out;
2268 }
2269
2270 io_map_base = hyp_idmap_start;
2271 return 0;
2272 out:
2273 free_hyp_pgds();
2274 return err;
2275 }
2276
2277 void kvm_arch_commit_memory_region(struct kvm *kvm,
2278 const struct kvm_userspace_memory_region *mem,
2279 const struct kvm_memory_slot *old,
2280 const struct kvm_memory_slot *new,
2281 enum kvm_mr_change change)
2282 {
2283 /*
2284 * At this point memslot has been committed and there is an
2285 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
2286 * memory slot is write protected.
2287 */
2288 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
2289 kvm_mmu_wp_memory_region(kvm, mem->slot);
2290 }
2291
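/*
 * Validate a new or moved memslot before it is committed: reject regions
 * outside the guest IPA space, refuse writable slots backed by read-only
 * VMAs, and eagerly map any VM_PFNMAP (device) VMAs at stage 2.
 */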
2292 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2293 struct kvm_memory_slot *memslot,
2294 const struct kvm_userspace_memory_region *mem,
2295 enum kvm_mr_change change)
2296 {
2297 hva_t hva = mem->userspace_addr;
2298 hva_t reg_end = hva + mem->memory_size;
2299 bool writable = !(mem->flags & KVM_MEM_READONLY);
2300 int ret = 0;
2301
2302 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2303 change != KVM_MR_FLAGS_ONLY)
2304 return 0;
2305
2306 /*
2307 * Prevent userspace from creating a memory region outside of the IPA
2308 	 * space addressable by the KVM guest.
2309 */
2310 if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
2311 return -EFAULT;
2312
2313 	down_read(&current->mm->mmap_sem);
2314 /*
2315 * A memory region could potentially cover multiple VMAs, and any holes
2316 * between them, so iterate over all of them to find out if we can map
2317 * any of them right now.
2318 *
2319 * +--------------------------------------------+
2320 * +---------------+----------------+ +----------------+
2321 * | : VMA 1 | VMA 2 | | VMA 3 : |
2322 * +---------------+----------------+ +----------------+
2323 * | memory region |
2324 * +--------------------------------------------+
2325 */
2326 do {
2327 struct vm_area_struct *vma = find_vma(current->mm, hva);
2328 hva_t vm_start, vm_end;
2329
2330 if (!vma || vma->vm_start >= reg_end)
2331 break;
2332
2333 /*
2334 * Mapping a read-only VMA is only allowed if the
2335 * memory region is configured as read-only.
2336 */
2337 if (writable && !(vma->vm_flags & VM_WRITE)) {
2338 ret = -EPERM;
2339 break;
2340 }
2341
2342 /*
2343 * Take the intersection of this VMA with the memory region
2344 */
2345 vm_start = max(hva, vma->vm_start);
2346 vm_end = min(reg_end, vma->vm_end);
2347
2348 if (vma->vm_flags & VM_PFNMAP) {
2349 gpa_t gpa = mem->guest_phys_addr +
2350 (vm_start - mem->userspace_addr);
2351 phys_addr_t pa;
2352
2353 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2354 pa += vm_start - vma->vm_start;
2355
2356 /* IO region dirty page logging not allowed */
2357 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2358 ret = -EINVAL;
2359 goto out;
2360 }
2361
2362 ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2363 vm_end - vm_start,
2364 writable);
2365 if (ret)
2366 break;
2367 }
2368 hva = vm_end;
2369 } while (hva < reg_end);
2370
2371 if (change == KVM_MR_FLAGS_ONLY)
2372 goto out;
2373
2374 spin_lock(&kvm->mmu_lock);
2375 if (ret)
2376 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
2377 else
2378 stage2_flush_memslot(kvm, memslot);
2379 spin_unlock(&kvm->mmu_lock);
2380 out:
2381 	up_read(&current->mm->mmap_sem);
2382 return ret;
2383 }
2384
2385 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
2386 struct kvm_memory_slot *dont)
2387 {
2388 }
2389
2390 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
2391 unsigned long npages)
2392 {
2393 return 0;
2394 }
2395
2396 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2397 {
2398 }
2399
2400 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2401 {
2402 kvm_free_stage2_pgd(kvm);
2403 }
2404
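/*
 * Unmap the stage-2 range backing @slot, e.g. when the memslot is being
 * deleted or moved.
 */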
2405 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2406 struct kvm_memory_slot *slot)
2407 {
2408 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2409 phys_addr_t size = slot->npages << PAGE_SHIFT;
2410
2411 spin_lock(&kvm->mmu_lock);
2412 unmap_stage2_range(kvm, gpa, size);
2413 spin_unlock(&kvm->mmu_lock);
2414 }
2415
2416 /*
2417 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2418 *
2419 * Main problems:
2420 * - S/W ops are local to a CPU (not broadcast)
2421 * - We have line migration behind our back (speculation)
2422 * - System caches don't support S/W at all (damn!)
2423 *
2424 * In the face of the above, the best we can do is to try and convert
2425 * S/W ops to VA ops. Because the guest is not allowed to infer the
2426 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2427 * which is a rather good thing for us.
2428 *
2429 * Also, it is only used when turning caches on/off ("The expected
2430 * usage of the cache maintenance instructions that operate by set/way
2431 * is associated with the cache maintenance instructions associated
2432 * with the powerdown and powerup of caches, if this is required by
2433 * the implementation.").
2434 *
2435 * We use the following policy:
2436 *
2437 * - If we trap a S/W operation, we enable VM trapping to detect
2438 * caches being turned on/off, and do a full clean.
2439 *
2440  * - We flush the caches both when they are turned on and when they are
 *   turned off.
2441 *
2442 * - Once the caches are enabled, we stop trapping VM ops.
2443 */
2444 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2445 {
2446 unsigned long hcr = *vcpu_hcr(vcpu);
2447
2448 /*
2449 * If this is the first time we do a S/W operation
2450 	 * (i.e. HCR_TVM not set) flush the whole of the guest's memory, and
2451 	 * enable VM trapping.
2452 *
2453 * Otherwise, rely on the VM trapping to wait for the MMU +
2454 * Caches to be turned off. At that point, we'll be able to
2455 * clean the caches again.
2456 */
2457 if (!(hcr & HCR_TVM)) {
2458 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2459 vcpu_has_cache_enabled(vcpu));
2460 stage2_flush_vm(vcpu->kvm);
2461 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2462 }
2463 }
2464
2465 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2466 {
2467 bool now_enabled = vcpu_has_cache_enabled(vcpu);
2468
2469 /*
2470 * If switching the MMU+caches on, need to invalidate the caches.
2471 * If switching it off, need to clean the caches.
2472 * Clean + invalidate does the trick always.
2473 */
2474 if (now_enabled != was_enabled)
2475 stage2_flush_vm(vcpu->kvm);
2476
2477 /* Caches are now on, stop trapping VM ops (until a S/W op) */
2478 if (now_enabled)
2479 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2480
2481 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2482 }
2483