1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
4 * No bombay mix was harmed in the writing of this file.
5 *
6 * Copyright (C) 2020 Google LLC
7 * Author: Will Deacon <will@kernel.org>
8 */
9
10 #include <linux/bitfield.h>
11 #include <asm/kvm_pgtable.h>
12 #include <asm/stage2_pgtable.h>
13
14
/*
 * Book-keeping for one page-table walk.
 *
 * @walker: descriptor supplying the callback, flags and argument of the walk.
 * @start:  first address of the walk; constant for the whole walk.
 * @addr:   address currently being visited; advanced as entries are consumed.
 * @end:    exclusive upper bound; the walk stops once @addr reaches it.
 */
struct kvm_pgtable_walk_data {
	struct kvm_pgtable_walker	*walker;

	const u64			start;
	u64				addr;
	const u64			end;
};
22
kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx * ctx)23 static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
24 {
25 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
26 }
27
kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx * ctx)28 static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
29 {
30 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
31 }
32
/*
 * A physical address is valid when it lies below BIT(shift), where shift is
 * the physical address width derived from the maximum supported PARange.
 */
static bool kvm_phys_is_valid(u64 phys)
{
	u64 max_parange = kvm_get_parange_max();
	u8 pa_bits = id_aa64mmfr0_parange_to_phys_shift(max_parange);
	u64 pa_limit = BIT(pa_bits);

	return pa_limit > phys;
}
40
/*
 * Decide whether the entry being visited can be a leaf mapping @phys.
 *
 * This requires a level that supports block mappings, at least one full
 * granule left before the end of the walk, and a granule-aligned input
 * address. The output address must be granule-aligned too, unless it is not
 * a valid physical address, in which case its alignment is not checked.
 */
static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
{
	u64 blk_size = kvm_granule_size(ctx->level);
	u64 remaining = ctx->end - ctx->addr;

	if (!kvm_level_supports_block_mapping(ctx->level) || remaining < blk_size)
		return false;

	if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, blk_size))
		return false;

	if (!IS_ALIGNED(ctx->addr, blk_size))
		return false;

	return true;
}
56
kvm_pgtable_idx(struct kvm_pgtable_walk_data * data,s8 level)57 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level)
58 {
59 u64 shift = kvm_granule_shift(level);
60 u64 mask = BIT(PAGE_SHIFT - 3) - 1;
61
62 return (data->addr >> shift) & mask;
63 }
64
kvm_pgd_page_idx(struct kvm_pgtable * pgt,u64 addr)65 static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
66 {
67 u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
68 u64 mask = BIT(pgt->ia_bits) - 1;
69
70 return (addr & mask) >> shift;
71 }
72
/*
 * Number of PGD pages needed for the given input-address width and start
 * level: one more than the PGD page index of the highest possible address.
 */
static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
{
	struct kvm_pgtable tmp = {
		.ia_bits	= ia_bits,
		.start_level	= start_level,
	};
	u32 last_idx = kvm_pgd_page_idx(&tmp, -1ULL);

	return last_idx + 1;
}
82
/* Zero the entry at @ptep with a single WRITE_ONCE() store. */
static void kvm_clear_pte(kvm_pte_t *ptep)
{
	const kvm_pte_t cleared = 0;

	WRITE_ONCE(*ptep, cleared);
}
87
/*
 * Build a valid table descriptor pointing at @childp. The child table's
 * physical address is obtained through @mm_ops->virt_to_phys().
 */
static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
{
	kvm_pte_t table = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));

	table |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE) | KVM_PTE_VALID;

	return table;
}
96
/*
 * Build a valid leaf descriptor mapping @pa. Only the leaf attribute bits of
 * @attr are kept. The descriptor type is "page" at the last level and
 * "block" at any other level.
 */
static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
{
	kvm_pte_t leaf_attr = attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
	u64 type;

	if (level == KVM_PGTABLE_LAST_LEVEL)
		type = KVM_PTE_TYPE_PAGE;
	else
		type = KVM_PTE_TYPE_BLOCK;

	return kvm_phys_to_pte(pa) | leaf_attr |
	       FIELD_PREP(KVM_PTE_TYPE, type) | KVM_PTE_VALID;
}
109
kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data * data,const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)110 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
111 const struct kvm_pgtable_visit_ctx *ctx,
112 enum kvm_pgtable_walk_flags visit)
113 {
114 struct kvm_pgtable_walker *walker = data->walker;
115
116 /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
117 WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
118 return walker->cb(ctx, visit);
119 }
120
kvm_pgtable_walk_continue(const struct kvm_pgtable_walker * walker,int r)121 static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
122 int r)
123 {
124 /*
125 * Visitor callbacks return EAGAIN when the conditions that led to a
126 * fault are no longer reflected in the page tables due to a race to
127 * update a PTE. In the context of a fault handler this is interpreted
128 * as a signal to retry guest execution.
129 *
130 * Ignore the return code altogether for walkers outside a fault handler
131 * (e.g. write protecting a range of memory) and chug along with the
132 * page table walk.
133 */
134 if (r == -EAGAIN)
135 return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);
136
137 return !r;
138 }
139
/*
 * Forward declaration: __kvm_pgtable_visit() descends into child tables by
 * calling __kvm_pgtable_walk(), which is defined after it.
 */
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops,
			      struct kvm_pgtable_pte_ops *pte_ops,
			      kvm_pteref_t pgtable, s8 level);
144
/*
 * Visit one page-table entry: run the walker callback according to the
 * walker's flags and the kind of entry found, then either step over a
 * non-table entry or recurse into the child table.
 *
 * Returns 0 when the walk should carry on, otherwise the first return code
 * for which kvm_pgtable_walk_continue() says the walk must stop.
 */
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
				      struct kvm_pgtable_mm_ops *mm_ops,
				      struct kvm_pgtable_pte_ops *pte_ops,
				      kvm_pteref_t pteref, s8 level)
{
	enum kvm_pgtable_walk_flags flags = data->walker->flags;
	kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
	/* Snapshot of the entry and of the walk state, handed to the callback. */
	struct kvm_pgtable_visit_ctx ctx = {
		.ptep	= ptep,
		.old	= READ_ONCE(*ptep),
		.arg	= data->walker->arg,
		.mm_ops	= mm_ops,
		.start	= data->start,
		.pte_ops = pte_ops,
		.addr	= data->addr,
		.end	= data->end,
		.level	= level,
		.flags	= flags,
	};
	int ret = 0;
	bool reload = false;
	kvm_pteref_t childp;
	bool table = kvm_pte_table(ctx.old, level);

	/* Pre-order callback for table entries. */
	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
		reload = true;
	}

	/*
	 * Callback for non-table entries. This tests the 'table' value sampled
	 * before the pre-order callback ran, so an entry gets at most one of
	 * the two callbacks above.
	 */
	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		reload = true;
	}

	/*
	 * Reload the page table after invoking the walker callback for leaf
	 * entries or after pre-order traversal, to allow the walker to descend
	 * into a newly installed or replaced table.
	 */
	if (reload) {
		ctx.old = READ_ONCE(*ptep);
		table = kvm_pte_table(ctx.old, level);
	}

	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	/* Not a table: advance the walk to the start of the next granule. */
	if (!table) {
		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
		data->addr += kvm_granule_size(level);
		goto out;
	}

	/* Table entry: walk the next level down before the post-order callback. */
	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
	ret = __kvm_pgtable_walk(data, mm_ops, pte_ops, childp, level + 1);
	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);

out:
	/* Return codes that do not stop the walk are reported as success. */
	if (kvm_pgtable_walk_continue(data->walker, ret))
		return 0;

	return ret;
}
212
/*
 * Walk the entries of one table page at @level, starting from the entry that
 * covers the walk's current address. Stops at the end of the table, once the
 * walk's end address is reached, or when visiting an entry returns non-zero.
 *
 * Returns -EINVAL (with a one-time warning) for an out-of-range @level,
 * otherwise 0 or the non-zero result of the visit that stopped the walk.
 */
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops,
			      struct kvm_pgtable_pte_ops *pte_ops,
			      kvm_pteref_t pgtable, s8 level)
{
	int ret;
	u32 idx;

	if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL ||
			 level > KVM_PGTABLE_LAST_LEVEL))
		return -EINVAL;

	idx = kvm_pgtable_idx(data, level);
	while (idx < PTRS_PER_PTE && data->addr < data->end) {
		ret = __kvm_pgtable_visit(data, mm_ops, pte_ops, &pgtable[idx], level);
		if (ret)
			return ret;

		idx++;
	}

	return 0;
}
238
_kvm_pgtable_walk(struct kvm_pgtable * pgt,struct kvm_pgtable_walk_data * data)239 static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
240 {
241 u32 idx;
242 int ret = 0;
243 u64 limit = BIT(pgt->ia_bits);
244
245 if (data->addr > limit || data->end > limit)
246 return -ERANGE;
247
248 if (!pgt->pgd)
249 return -EINVAL;
250
251 for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
252 kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];
253
254 ret = __kvm_pgtable_walk(data, pgt->mm_ops, pgt->pte_ops,
255 pteref, pgt->start_level);
256 if (ret)
257 break;
258 }
259
260 return ret;
261 }
262
/*
 * kvm_pgtable_walk() - Walk a range of a page-table with a caller-supplied
 *			walker.
 * @pgt:    page-table to walk.
 * @addr:   start of the range; rounded down to a page boundary.
 * @size:   size of the range; the end of the walk is the rounded-down start
 *	    plus @size, rounded up to a page boundary.
 * @walker: callback, flags and argument driving the walk.
 *
 * Returns the error from kvm_pgtable_walk_begin() if the walk could not be
 * started, otherwise the result of the walk itself (0 on success).
 */
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker)
{
	/*
	 * Compute the bounds before building walk_data rather than reading
	 * walk_data.addr from inside walk_data's own initializer: the
	 * expressions of an initializer list are indeterminately sequenced,
	 * so .addr is not guaranteed to be stored when .end is evaluated.
	 */
	const u64 start = ALIGN_DOWN(addr, PAGE_SIZE);
	const u64 end = PAGE_ALIGN(start + size);
	struct kvm_pgtable_walk_data walk_data = {
		.start	= start,
		.addr	= start,
		.end	= end,
		.walker	= walker,
	};
	int r;

	r = kvm_pgtable_walk_begin(walker);
	if (r)
		return r;

	r = _kvm_pgtable_walk(pgt, &walk_data);
	kvm_pgtable_walk_end(walker);

	return r;
}
283
/*
 * Result slot filled in by leaf_walker().
 *
 * @pte:   value of the entry the walker visited.
 * @level: level at which that entry was found.
 */
struct leaf_walk_data {
	kvm_pte_t	pte;
	s8		level;
};
288
leaf_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)289 static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
290 enum kvm_pgtable_walk_flags visit)
291 {
292 struct leaf_walk_data *data = ctx->arg;
293
294 data->pte = ctx->old;
295 data->level = ctx->level;
296
297 return 0;
298 }
299
/*
 * kvm_pgtable_get_leaf() - Look up the non-table entry covering @addr.
 * @pgt:   page-table to search.
 * @addr:  address to look up; the walk covers the single page containing it.
 * @ptep:  if non-NULL, receives the entry's value on success.
 * @level: if non-NULL, receives the entry's level on success.
 *
 * Returns 0 on success or the error returned by the walk, in which case
 * neither output is written.
 */
int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
			 kvm_pte_t *ptep, s8 *level)
{
	struct leaf_walk_data leaf;
	struct kvm_pgtable_walker walker = {
		.cb	= leaf_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &leaf,
	};
	int ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
				   PAGE_SIZE, &walker);

	if (ret)
		return ret;

	if (ptep)
		*ptep = leaf.pte;

	if (level)
		*level = leaf.level;

	return 0;
}
322
/*
 * Arguments carried through a hyp stage-1 map walk.
 *
 * @phys: physical address backing the start of the walk; the walker adds the
 *	  current offset from the walk's start address to it.
 * @attr: leaf attribute bits, as produced by hyp_set_prot_attr().
 */
struct hyp_map_data {
	const u64	phys;
	kvm_pte_t	attr;
};
327
/*
 * Translate @prot into hyp stage-1 leaf attributes, stored in *@ptep.
 *
 * Returns -EINVAL, leaving *@ptep untouched, when:
 *  - the mapping is not readable, or carries PXN or UXN;
 *  - both DEVICE and NORMAL_NC are requested;
 *  - an executable mapping is also writable or is not normal memory.
 * Returns 0 otherwise.
 */
static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
	enum kvm_pgtable_prot memtype = prot & (KVM_PGTABLE_PROT_DEVICE |
						  KVM_PGTABLE_PROT_NORMAL_NC);
	kvm_pte_t attr;
	u32 mtype;
	u32 ap;

	if (!(prot & KVM_PGTABLE_PROT_R))
		return -EINVAL;

	if (prot & (KVM_PGTABLE_PROT_PXN | KVM_PGTABLE_PROT_UXN))
		return -EINVAL;

	/* Pick the memory type; DEVICE and NORMAL_NC are mutually exclusive. */
	if (memtype == (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC))
		return -EINVAL;
	else if (memtype == KVM_PGTABLE_PROT_DEVICE)
		mtype = MT_DEVICE_nGnRE;
	else if (memtype == KVM_PGTABLE_PROT_NORMAL_NC)
		mtype = MT_NORMAL_NC;
	else
		mtype = MT_NORMAL;

	attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);

	if (!(prot & KVM_PGTABLE_PROT_X)) {
		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
	} else {
		/* Executable mappings must be read-only normal memory. */
		if (prot & KVM_PGTABLE_PROT_W)
			return -EINVAL;

		if (mtype != MT_NORMAL)
			return -EINVAL;

		if (system_supports_bti_kernel())
			attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
	}

	if (prot & KVM_PGTABLE_PROT_W)
		ap = KVM_PTE_LEAF_ATTR_LO_S1_AP_RW;
	else
		ap = KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);

	/* The shareability field is only written when LPA2 is not enabled. */
	if (!kvm_lpa2_is_enabled())
		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH,
				   KVM_PTE_LEAF_ATTR_LO_S1_SH_IS);

	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;

	*ptep = attr;
	return 0;
}
376
/*
 * kvm_pgtable_hyp_pte_prot() - Recover the protection encoded in a hyp
 *				stage-1 PTE.
 * @pte: page-table entry to decode.
 *
 * The software bits are always reported. For a valid entry, X is added when
 * the XN bit is clear, and R or RW according to the access-permission field.
 */
enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
	u32 ap;

	if (kvm_pte_valid(pte)) {
		if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
			prot |= KVM_PGTABLE_PROT_X;

		ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
		if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
			prot |= KVM_PGTABLE_PROT_R;
		else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
			prot |= KVM_PGTABLE_PROT_RW;
	}

	return prot;
}
396
hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx * ctx,struct hyp_map_data * data)397 static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
398 struct hyp_map_data *data)
399 {
400 u64 phys = data->phys + (ctx->addr - ctx->start);
401 kvm_pte_t new;
402
403 if (!kvm_block_mapping_supported(ctx, phys))
404 return false;
405
406 new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
407 if (ctx->old == new)
408 return true;
409 if (!kvm_pte_valid(ctx->old))
410 ctx->mm_ops->get_page(ctx->ptep);
411 else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
412 return false;
413
414 smp_store_release(ctx->ptep, new);
415 return true;
416 }
417
hyp_map_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)418 static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
419 enum kvm_pgtable_walk_flags visit)
420 {
421 kvm_pte_t *childp, new;
422 struct hyp_map_data *data = ctx->arg;
423 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
424
425 if (hyp_map_walker_try_leaf(ctx, data))
426 return 0;
427
428 if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
429 return -EINVAL;
430
431 childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
432 if (!childp)
433 return -ENOMEM;
434
435 new = kvm_init_table_pte(childp, mm_ops);
436 mm_ops->get_page(ctx->ptep);
437 smp_store_release(ctx->ptep, new);
438
439 return 0;
440 }
441
/*
 * kvm_pgtable_hyp_map() - Install a mapping in a hyp stage-1 page-table.
 * @pgt:  page-table to update.
 * @addr: start of the virtual range.
 * @size: size of the range.
 * @phys: physical address to map; rounded down to a page boundary.
 * @prot: requested protection.
 *
 * Returns the error from hyp_set_prot_attr() when @prot is rejected, without
 * touching the page-table. Otherwise returns the result of the walk; the
 * dsb(ishst)/isb() pair is issued whether or not the walk succeeded.
 */
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			enum kvm_pgtable_prot prot)
{
	struct hyp_map_data map_data = {
		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
	};
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_map_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &map_data,
	};
	int ret = hyp_set_prot_attr(prot, &map_data.attr);

	if (!ret) {
		ret = kvm_pgtable_walk(pgt, addr, size, &walker);
		dsb(ishst);
		isb();
	}

	return ret;
}
464
/*
 * Callback of the hyp unmap walk, invoked for leaf entries and, post-order,
 * for table entries.
 *
 * Clears the entry, invalidates the TLB for its address and drops the page
 * references it held. The granule size of each cleared leaf is added to the
 * u64 counter passed as the walker argument.
 *
 * Returns -EINVAL for an invalid entry or for a leaf that extends past the
 * end of the walk, 0 otherwise.
 */
static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			    enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp = NULL;
	u64 granule = kvm_granule_size(ctx->level);
	u64 *unmapped = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EINVAL;

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		/*
		 * Leave the table in place unless its page count is exactly 1.
		 * NOTE(review): presumably a count of 1 means no entry of the
		 * child table is in use any more; confirm against mm_ops.
		 */
		if (mm_ops->page_count(childp) != 1)
			return 0;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
	} else {
		/* Refuse to unmap only part of a leaf. */
		if (ctx->end - ctx->addr < granule)
			return -EINVAL;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
		*unmapped += granule;
	}

	/* Barriers after the TLBI, before any page reference is dropped. */
	dsb(ish);
	isb();
	mm_ops->put_page(ctx->ptep);

	/* Only set on the table path: drop the reference on the child table. */
	if (childp)
		mm_ops->put_page(childp);

	return 0;
}
504
/*
 * kvm_pgtable_hyp_unmap() - Remove a mapping from a hyp stage-1 page-table.
 * @pgt:  page-table to update.
 * @addr: start of the virtual range.
 * @size: size of the range.
 *
 * Returns the number of bytes unmapped, as accumulated by the walker. This
 * is 0 when the mm_ops provide no page_count() callback, in which case no
 * walk is performed. The walk's own return code is not reported.
 */
u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 unmapped = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_unmap_walker,
		.arg	= &unmapped,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	if (pgt->mm_ops->page_count)
		kvm_pgtable_walk(pgt, addr, size, &walker);

	return unmapped;
}
520
/*
 * kvm_pgtable_hyp_init() - Initialise a hyp stage-1 page-table.
 * @pgt:     uninitialised page-table structure.
 * @va_bits: virtual address width; determines the start level.
 * @mm_ops:  memory-management callbacks; zalloc_page() provides the PGD.
 *
 * Returns -EINVAL when the computed start level is outside the supported
 * range, -ENOMEM when the PGD cannot be allocated, 0 on success. The mmu and
 * pte_ops fields are set to NULL.
 */
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
			 struct kvm_pgtable_mm_ops *mm_ops)
{
	s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 -
			 ARM64_HW_PGTABLE_LEVELS(va_bits);

	if (start_level > KVM_PGTABLE_LAST_LEVEL ||
	    start_level < KVM_PGTABLE_FIRST_LEVEL)
		return -EINVAL;

	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->mm_ops		= mm_ops;
	pgt->ia_bits		= va_bits;
	pgt->start_level	= start_level;
	pgt->pte_ops		= NULL;
	pgt->mmu		= NULL;

	return 0;
}
543
hyp_free_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)544 static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
545 enum kvm_pgtable_walk_flags visit)
546 {
547 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
548
549 if (!kvm_pte_valid(ctx->old))
550 return 0;
551
552 mm_ops->put_page(ctx->ptep);
553
554 if (kvm_pte_table(ctx->old, ctx->level))
555 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
556
557 return 0;
558 }
559
kvm_pgtable_hyp_destroy(struct kvm_pgtable * pgt)560 void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
561 {
562 struct kvm_pgtable_walker walker = {
563 .cb = hyp_free_walker,
564 .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
565 };
566
567 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
568 pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
569 pgt->pgd = NULL;
570 }
571
/*
 * Arguments carried through a stage-2 map walk.
 */
struct stage2_map_data {
	/* Base output address; an invalid PA makes the walker store @annotation. */
	const u64			phys;
	/* Leaf attribute bits used for every valid PTE the walk installs. */
	kvm_pte_t			attr;
	/* Raw PTE value installed in place of a mapping when @phys is invalid. */
	u64				annotation;

	/* NOTE(review): anchor and childp are not referenced in this part of the file. */
	kvm_pte_t			*anchor;
	kvm_pte_t			*childp;

	/* MMU being updated: gives access to its page-table and is passed to TLB maintenance. */
	struct kvm_s2_mmu		*mmu;
	/* NOTE(review): presumably the cache new table pages come from; confirm. */
	void				*memcache;

	/* Force mappings to page granularity */
	bool				force_pte;
};
586
/*
 * kvm_get_vtcr() - Compute the VTCR_EL2 value for a stage-2 configuration.
 * @mmfr0:      value used to derive the physical address range (PS field).
 * @mmfr1:      value used to derive the number of VMID bits.
 * @phys_shift: IPA size in bits; determines T0SZ and the number of levels.
 *
 * Returns the assembled VTCR_EL2 value.
 */
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
	u64 vtcr = VTCR_EL2_FLAGS;
	s8 lvls;

	vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
	vtcr |= VTCR_EL2_T0SZ(phys_shift);
	/*
	 * Use a minimum 2 level page table to prevent splitting
	 * host PMD huge pages at stage2.
	 */
	lvls = stage2_pgtable_levels(phys_shift);
	if (lvls < 2)
		lvls = 2;

	/*
	 * When LPA2 is enabled, the HW supports an extra level of translation
	 * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2
	 * as an addition to SL0 to enable encoding this extra start level.
	 * However, since we always use concatenated pages for the first level
	 * lookup, we will never need this extra level and therefore do not need
	 * to touch SL2.
	 */
	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);

#ifdef CONFIG_ARM64_HW_AFDBM
	/*
	 * Enable the Hardware Access Flag management, unconditionally
	 * on all CPUs. In systems that have asymmetric support for the feature
	 * this allows KVM to leverage hardware support on the subset of cores
	 * that implement the feature.
	 *
	 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
	 * hardware) on implementations that do not advertise support for the
	 * feature. As such, setting HA unconditionally is safe, unless you
	 * happen to be running on a design that has unadvertised support for
	 * HAFDBS. Here be dragons.
	 */
	if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
		vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */

	/* The DS bit is set whenever LPA2 is in use. */
	if (kvm_lpa2_is_enabled())
		vtcr |= VTCR_EL2_DS;

	/* Set the vmid bits */
	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
		VTCR_EL2_VS_16BIT :
		VTCR_EL2_VS_8BIT;

	return vtcr;
}
639
stage2_has_fwb(struct kvm_pgtable * pgt)640 static bool stage2_has_fwb(struct kvm_pgtable *pgt)
641 {
642 if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
643 return false;
644
645 return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
646 }
647
/*
 * kvm_tlb_flush_vmid_range() - Invalidate the TLB for a range of a VMID.
 * @mmu:  stage-2 MMU whose TLB entries are invalidated.
 * @addr: start of the range.
 * @size: size of the range in bytes.
 *
 * Without range-based TLBI support the whole VMID is flushed instead.
 * Otherwise the range is invalidated in chunks of at most
 * MAX_TLBI_RANGE_PAGES pages.
 */
void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
				phys_addr_t addr, size_t size)
{
	unsigned long pages, inval_pages;

	if (!system_supports_tlb_range()) {
		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
		return;
	}

	for (pages = size >> PAGE_SHIFT; pages > 0; pages -= inval_pages) {
		inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
		kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);

		addr += inval_pages << PAGE_SHIFT;
	}
}
667
/*
 * Stage-2 memory attribute for @attr on @pgt: expands to PAGE_S2_MEMATTR()
 * with stage2_has_fwb(@pgt) as its second argument.
 */
#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
669
/*
 * Translate @prot into stage-2 leaf attributes for @pgt, stored in *@ptep.
 *
 * Returns -EINVAL, leaving *@ptep untouched, when:
 *  - both DEVICE and NORMAL_NC are requested;
 *  - a DEVICE or NORMAL_NC mapping is also executable;
 *  - more than one of X, PXN and UXN is requested.
 * Returns 0 otherwise.
 */
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
				kvm_pte_t *ptep)
{
	enum kvm_pgtable_prot memtype = prot & (KVM_PGTABLE_PROT_DEVICE |
						  KVM_PGTABLE_PROT_NORMAL_NC);
	enum kvm_pgtable_prot exec_prot = prot & (KVM_PGTABLE_PROT_X |
						    KVM_PGTABLE_PROT_PXN |
						    KVM_PGTABLE_PROT_UXN);
	u64 exec_type = KVM_PTE_LEAF_ATTR_HI_S2_XN_XN;
	kvm_pte_t attr;

	if (!memtype) {
		attr = KVM_S2_MEMATTR(pgt, NORMAL);
	} else {
		/* DEVICE and NORMAL_NC are mutually exclusive and never executable. */
		if (memtype == (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC))
			return -EINVAL;

		if (prot & KVM_PGTABLE_PROT_X)
			return -EINVAL;

		if (memtype == KVM_PGTABLE_PROT_DEVICE)
			attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
		else
			attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
	}

	/*
	 * A plain executable mapping leaves the XN field clear. Otherwise
	 * encode PXN, UXN or, when no execute permission is requested at
	 * all, the default XN value.
	 */
	if (exec_prot != KVM_PGTABLE_PROT_X) {
		if (exec_prot == KVM_PGTABLE_PROT_PXN)
			exec_type = KVM_PTE_LEAF_ATTR_HI_S2_XN_PXN;
		else if (exec_prot == KVM_PGTABLE_PROT_UXN)
			exec_type = KVM_PTE_LEAF_ATTR_HI_S2_XN_UXN;
		else if (exec_prot)
			return -EINVAL;

		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, exec_type);
	}

	if (prot & KVM_PGTABLE_PROT_R)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	/* The shareability field is only written when LPA2 is not enabled. */
	if (!kvm_lpa2_is_enabled())
		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH,
				   KVM_PTE_LEAF_ATTR_LO_S2_SH_IS);

	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;

	*ptep = attr;
	return 0;
}
728
/*
 * kvm_pgtable_stage2_pte_prot() - Recover the protection encoded in a
 *				   stage-2 PTE.
 * @pte: page-table entry to decode.
 *
 * The software bits are always reported. For a valid entry, R and W follow
 * the S2AP bits, and the XN field is decoded into X, PXN, UXN or no execute
 * permission at all; any other XN encoding triggers a warning.
 */
enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
	kvm_pte_t xn;

	if (!kvm_pte_valid(pte))
		return prot;

	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
		prot |= KVM_PGTABLE_PROT_R;

	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
		prot |= KVM_PGTABLE_PROT_W;

	xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte);
	if (xn == 0)
		prot |= KVM_PGTABLE_PROT_X;
	else if (xn == KVM_PTE_LEAF_ATTR_HI_S2_XN_PXN)
		prot |= KVM_PGTABLE_PROT_PXN;
	else if (xn == KVM_PTE_LEAF_ATTR_HI_S2_XN_UXN)
		prot |= KVM_PGTABLE_PROT_UXN;
	else if (xn != KVM_PTE_LEAF_ATTR_HI_S2_XN_XN)
		WARN_ON(1);

	return prot;
}
759
/*
 * Decide whether replacing @old with @new requires touching the PTE.
 *
 * Identity-mapped page-tables (KVM_PGTABLE_S2_IDMAP) are always updated, as
 * is any transition involving an invalid entry. Between two valid entries an
 * update is needed only when they differ outside the permission bits.
 */
static bool stage2_pte_needs_update(struct kvm_pgtable *pgt,
				    kvm_pte_t old, kvm_pte_t new)
{
	kvm_pte_t diff = (old ^ new) & ~KVM_PTE_LEAF_ATTR_S2_PERMS;

	/* Following filter logic applies only to guest stage-2 entries. */
	if (pgt->flags & KVM_PGTABLE_S2_IDMAP)
		return true;

	if (kvm_pte_valid(old) && kvm_pte_valid(new))
		return diff != 0;

	return true;
}
772
/* A PTE is locked when it is invalid and carries KVM_INVALID_PTE_LOCKED. */
static bool stage2_pte_is_locked(kvm_pte_t pte)
{
	return !(kvm_pte_valid(pte) || !(pte & KVM_INVALID_PTE_LOCKED));
}
777
/*
 * Store @new in the visited PTE.
 *
 * A shared walk uses cmpxchg() against the value sampled in ctx->old and
 * reports whether the exchange took place. A non-shared walk stores
 * unconditionally and always succeeds.
 */
static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	if (kvm_pgtable_walk_shared(ctx))
		return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;

	WRITE_ONCE(*ctx->ptep, new);
	return true;
}
787
/**
 * stage2_try_break_pte() - Invalidates a pte according to the
 *			    'break-before-make' requirements of the
 *			    architecture.
 *
 * @ctx: context of the visited pte.
 * @mmu: stage-2 mmu
 *
 * Returns: true if the pte was successfully broken.
 *
 * If the removed pte was valid, performs the necessary serialization and TLB
 * invalidation for the old value. For counted ptes, drops the reference count
 * on the containing table page.
 *
 * On success the pte is left holding KVM_INVALID_PTE_LOCKED. Returns false,
 * without modifying the pte, when it is already locked or when
 * stage2_try_set_pte() fails to install the locked value.
 */
static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	struct kvm_pgtable_pte_ops *pte_ops = ctx->pte_ops;

	if (stage2_pte_is_locked(ctx->old)) {
		/*
		 * Should never occur if this walker has exclusive access to the
		 * page tables.
		 */
		WARN_ON(!kvm_pgtable_walk_shared(ctx));
		return false;
	}

	/* "Break": replace the old value with the locked marker. */
	if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
		return false;

	if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
		/*
		 * Perform the appropriate TLB invalidation based on the
		 * evicted pte value (if any).
		 */
		if (kvm_pte_table(ctx->old, ctx->level)) {
			/* A table covered a whole granule at this level: flush all of it. */
			u64 size = kvm_granule_size(ctx->level);
			u64 addr = ALIGN_DOWN(ctx->addr, size);

			kvm_tlb_flush_vmid_range(mmu, addr, size);
		} else if (kvm_pte_valid(ctx->old)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
				     ctx->addr, ctx->level);
		}
	}

	/* The evicted value no longer holds a reference on its table page. */
	if (pte_ops->pte_is_counted_cb(ctx->old, ctx->level))
		mm_ops->put_page(ctx->ptep);

	return true;
}
841
/*
 * "Make" half of break-before-make: publish @new in a PTE that is expected
 * to hold the locked marker (a warning fires otherwise). A reference is taken
 * on the containing table page when @new is a counted entry.
 */
static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx,
			    kvm_pte_t new)
{
	WARN_ON(!stage2_pte_is_locked(*ctx->ptep));

	if (ctx->pte_ops->pte_is_counted_cb(new, ctx->level))
		ctx->mm_ops->get_page(ctx->ptep);

	smp_store_release(ctx->ptep, new);
}
855
/*
 * Tell whether an unmap walk on @pgt may postpone its per-entry TLB
 * invalidations.
 *
 * If FEAT_TLBIRANGE is implemented, the individual invalidations can be
 * deferred until the entire walk is finished and then done with range-based
 * TLBI instructions. This is only enabled when FWB is in use, because the
 * optimisation is pointless when the unmap walker has to perform CMOs.
 */
static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
{
	if (!system_supports_tlb_range())
		return false;

	return stage2_has_fwb(pgt);
}
868
stage2_unmap_clear_pte(const struct kvm_pgtable_visit_ctx * ctx,struct kvm_s2_mmu * mmu)869 static void stage2_unmap_clear_pte(const struct kvm_pgtable_visit_ctx *ctx,
870 struct kvm_s2_mmu *mmu)
871 {
872 struct kvm_pgtable *pgt = ctx->arg;
873 if (kvm_pte_valid(ctx->old)) {
874 kvm_clear_pte(ctx->ptep);
875
876 if (kvm_pte_table(ctx->old, ctx->level)) {
877 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
878 TLBI_TTL_UNKNOWN);
879 } else if (!stage2_unmap_defer_tlb_flush(pgt)) {
880 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
881 ctx->level);
882 }
883 }
884 }
885
stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx * ctx,struct kvm_s2_mmu * mmu,struct kvm_pgtable_mm_ops * mm_ops)886 static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
887 struct kvm_s2_mmu *mmu,
888 struct kvm_pgtable_mm_ops *mm_ops)
889 {
890 /*
891 * Clear the existing PTE, and perform break-before-make if it was
892 * valid. Depending on the system support, defer the TLB maintenance
893 * for the same until the entire unmap walk is completed.
894 */
895 stage2_unmap_clear_pte(ctx, mmu);
896 mm_ops->put_page(ctx->ptep);
897 }
898
/*
 * A PTE is cacheable when it is valid and its memory-attribute field matches
 * the NORMAL encoding for @pgt.
 */
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
	if (!kvm_pte_valid(pte))
		return false;

	return (pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR) == KVM_S2_MEMATTR(pgt, NORMAL);
}
904
/*
 * A PTE is executable when it is valid and its XN field holds anything other
 * than the full execute-never encoding.
 */
static bool stage2_pte_executable(kvm_pte_t pte)
{
	if (!kvm_pte_valid(pte))
		return false;

	return FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte) != KVM_PTE_LEAF_ATTR_HI_S2_XN_XN;
}
911
stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx * ctx,const struct stage2_map_data * data)912 static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
913 const struct stage2_map_data *data)
914 {
915 u64 phys = data->phys;
916
917 /*
918 * Stage-2 walks to update ownership data are communicated to the map
919 * walker using an invalid PA. Avoid offsetting an already invalid PA,
920 * which could overflow and make the address valid again.
921 */
922 if (!kvm_phys_is_valid(phys))
923 return phys;
924
925 /*
926 * Otherwise, work out the correct PA based on how far the walk has
927 * gotten.
928 */
929 return phys + (ctx->addr - ctx->start);
930 }
931
stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx * ctx,struct stage2_map_data * data)932 static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
933 struct stage2_map_data *data)
934 {
935 u64 phys = stage2_map_walker_phys_addr(ctx, data);
936
937 if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL)
938 return false;
939
940 return kvm_block_mapping_supported(ctx, phys);
941 }
942
/*
 * Try to install a leaf entry at the visited PTE, or the annotation value
 * when the target PA is not valid.
 *
 * Returns 0 on success, -E2BIG when a leaf is not allowed at this level for
 * the requested range, and -EAGAIN when the existing counted PTE needs no
 * update or when the break step could not take ownership of the PTE.
 */
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				      struct stage2_map_data *data)
{
	kvm_pte_t new;
	u64 phys = stage2_map_walker_phys_addr(ctx, data);
	u64 granule = kvm_granule_size(ctx->level);
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops;
	bool old_is_counted;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return -E2BIG;

	/* A valid PA yields a real mapping; otherwise store the annotation. */
	if (kvm_phys_is_valid(phys))
		new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	else
		new = data->annotation;

	old_is_counted = pte_ops->pte_is_counted_cb(ctx->old, ctx->level);
	if (old_is_counted) {
		/*
		 * Skip updating a guest PTE if we are trying to recreate the
		 * exact same mapping or change only the access permissions.
		 * Instead, the vCPU will exit one more time from the guest if
		 * still needed and then go through the path of relaxing
		 * permissions. This applies only to guest PTEs; Host PTEs
		 * are unconditionally updated. The host cannot livelock
		 * because the abort handler has done prior checks before
		 * calling here.
		 */
		if (!stage2_pte_needs_update(pgt, ctx->old, new))
			return -EAGAIN;
	}

	/* If we're only changing software bits, then store them and go! */
	if (!kvm_pgtable_walk_shared(ctx) &&
	    !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) {
		/* Adjust the table page refcount if the counted status changes. */
		if (old_is_counted != pte_ops->pte_is_counted_cb(new, ctx->level)) {
			if (old_is_counted)
				mm_ops->put_page(ctx->ptep);
			else
				mm_ops->get_page(ctx->ptep);
		}
		WARN_ON_ONCE(!stage2_try_set_pte(ctx, new));
		return 0;
	}

	/* Break-before-make: lock the PTE and invalidate the old translation. */
	if (!stage2_try_break_pte(ctx, data->mmu))
		return -EAGAIN;

	/* Perform CMOs before installation of the guest stage-2 PTE */
	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
	    stage2_pte_cacheable(pgt, new))
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
					       granule);

	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
	    stage2_pte_executable(new))
		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);

	/* "Make": publish the new value, taking a reference if it is counted. */
	stage2_make_pte(ctx, new);

	return 0;
}
1008
stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx * ctx,struct stage2_map_data * data)1009 static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
1010 struct stage2_map_data *data)
1011 {
1012 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1013 kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
1014 int ret;
1015
1016 if (!stage2_leaf_mapping_allowed(ctx, data))
1017 return 0;
1018
1019 ret = stage2_map_walker_try_leaf(ctx, data);
1020 if (ret)
1021 return ret;
1022
1023 mm_ops->free_unlinked_table(childp, ctx->level);
1024 return 0;
1025 }
1026
/*
 * Pre-populate a not-yet-installed child table (@ptep) with next-level leaf
 * entries derived from the valid block entry found in ctx->old. Does nothing
 * if ctx->old is not valid.
 *
 * @pte_ops: supplies pte_is_counted_cb(), used to decide whether each
 *           written entry also takes a reference via mm_ops->get_page().
 * @ctx:     visit context of the block entry being replaced.
 * @ptep:    first entry of the new child table.
 */
static void stage2_map_prefault_block(struct kvm_pgtable_pte_ops *pte_ops,
				      const struct kvm_pgtable_visit_ctx *ctx,
				      kvm_pte_t *ptep)
{
	const kvm_pte_t block = ctx->old;
	u64 base, step;
	bool take_ref;
	int idx;

	if (!kvm_pte_valid(block))
		return;

	base = kvm_pte_to_phys(block);
	step = kvm_granule_size(ctx->level + 1);
	take_ref = pte_ops->pte_is_counted_cb(block, ctx->level + 1);

	for (idx = 0; idx < PTRS_PER_PTE; idx++) {
		kvm_pte_t *slot = &ptep[idx];
		u64 pa = base + idx * step;
		kvm_pte_t leaf = kvm_init_valid_leaf_pte(pa, block,
							 ctx->level + 1);
		bool in_range = (pa >= ctx->addr) && (pa < ctx->end);

		/*
		 * When the child entries are last-level ones, leave those
		 * inside the caller's range alone: the caller is about to
		 * write them. At higher levels a valid mapping is installed
		 * everywhere so that the prefaulting logic triggers again on
		 * the next stage2_map_walk_leaf(). That costs an extra TLBI
		 * when the fresh block is re-broken, which should be rare.
		 */
		if (ctx->level >= (KVM_PGTABLE_LAST_LEVEL - 1) && in_range)
			continue;

		/* Plain store is fine: the table is not live yet. */
		*slot = leaf;

		if (take_ref)
			ctx->mm_ops->get_page(slot);
	}
}
1065
/*
 * LEAF visitor for the map walk.
 *
 * First tries to install the mapping as a leaf at the current level. Only
 * when stage2_map_walker_try_leaf() reports -E2BIG does it go on to allocate
 * a child table from data->memcache and install a table entry in place of
 * the current one, so that the walk can descend.
 *
 * Returns whatever stage2_map_walker_try_leaf() returned if that is not
 * -E2BIG. Otherwise: -EINVAL if already at the last level, -ENOMEM if there
 * is no memcache or the allocation fails, -EAGAIN if the existing entry
 * could not be broken, and 0 once the table entry has been installed.
 */
static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops;
	kvm_pte_t *childp, new;
	int ret;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret != -E2BIG)
		return ret;

	/* A table cannot be installed below the last level. */
	if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
		return -EINVAL;

	if (!data->memcache)
		return -ENOMEM;

	childp = mm_ops->zalloc_page(data->memcache);
	if (!childp)
		return -ENOMEM;

	/* An IDMAP table is not expected to hold a counted entry here. */
	WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) &&
		pte_ops->pte_is_counted_cb(ctx->old, ctx->level));

	/* Fill the child before it becomes reachable from the parent. */
	if (pgt->flags & KVM_PGTABLE_S2_PREFAULT_BLOCK)
		stage2_map_prefault_block(pte_ops, ctx, childp);

	if (!stage2_try_break_pte(ctx, data->mmu)) {
		/* Drop our reference on the page allocated above. */
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * If we've run into an existing block mapping then replace it with
	 * a table. Accesses beyond 'end' that fall within the new table
	 * will be mapped lazily.
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);
	return 0;
}
1109
/*
 * Debug-only sanity check, compiled in with CONFIG_PKVM_STRICT_CHECKS: warn
 * for every entry of the child table at @ptep that differs from the valid
 * leaf that kvm_init_valid_leaf_pte() builds for the matching slice of the
 * range starting at @pa, using data->attr. Without the config option this
 * function is empty.
 */
static void debug_check_table_before_coalescing(
	const struct kvm_pgtable_visit_ctx *ctx,
	struct stage2_map_data *data,
	kvm_pte_t *ptep, u64 pa)
{
#ifdef CONFIG_PKVM_STRICT_CHECKS
	const u64 step = kvm_granule_size(ctx->level + 1);
	int idx;

	for (idx = 0; idx < PTRS_PER_PTE; idx++) {
		kvm_pte_t expected = kvm_init_valid_leaf_pte(pa + idx * step,
							     data->attr,
							     ctx->level + 1);

		WARN_ON(expected != ptep[idx]);
	}
#endif
}
1126
/*
 * TABLE_POST visitor for the map walk: collapse a child table back into a
 * single block entry when that is possible.
 *
 * Nothing is done (and 0 is returned) unless kvm_protected_mode_initialized
 * is set, data->phys is a valid physical address, the current level supports
 * block mappings and the child table's page_count() is exactly 1. Otherwise
 * the table entry is broken, a valid leaf built from data->attr for the
 * granule-aligned address is installed, and the child table page is put.
 *
 * Always returns 0.
 */
static int stage2_coalesce_walk_table_post(const struct kvm_pgtable_visit_ctx *ctx,
					   struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t new, *childp = kvm_pte_follow(ctx->old, mm_ops);
	u64 size, addr;

	/*
	 * We don't want to coalesce during pkvm initialisation, before the
	 * overall structure of the host S2 table is created.
	 */
	if (!static_branch_likely(&kvm_protected_mode_initialized))
		return 0;

	/*
	 * If we installed a non-refcounted valid mapping, and the table has no
	 * other raised references, then we can immediately collapse to a block
	 * mapping.
	 */
	if (!kvm_phys_is_valid(data->phys) ||
	    !kvm_level_supports_block_mapping(ctx->level) ||
	    (mm_ops->page_count(childp) != 1))
		return 0;

	/*
	 * This should apply only to the host S2, which does not refcount its
	 * default memory and mmio mappings.
	 */
	WARN_ON(!(data->mmu->pgt->flags & KVM_PGTABLE_S2_IDMAP));

	/* Start of the granule covered by this entry. */
	size = kvm_granule_size(ctx->level);
	addr = ALIGN_DOWN(ctx->addr, size);

	debug_check_table_before_coalescing(ctx, data, childp, addr);

	/*
	 * The input address is used as the output address of the new leaf.
	 * NOTE(review): this presumes an identity-mapped table, which the
	 * WARN_ON above only reports and does not enforce.
	 */
	new = kvm_init_valid_leaf_pte(addr, data->attr, ctx->level);

	/* Breaking must succeed, as this is not a shared walk. */
	WARN_ON(!stage2_try_break_pte(ctx, data->mmu));

	/* Host doesn't require CMOs. */
	WARN_ON(mm_ops->dcache_clean_inval_poc || mm_ops->icache_inval_pou);

	stage2_make_pte(ctx, new);

	/* Finally, free the unlinked table. */
	mm_ops->put_page(childp);

	return 0;
}
1177
1178 /*
1179 * The TABLE_PRE callback runs for table entries on the way down, looking
1180 * for table entries which we could conceivably replace with a block entry
1181 * for this mapping. If it finds one it replaces the entry and calls
1182 * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
1183 *
1184 * Otherwise, the LEAF callback performs the mapping at the existing leaves
1185 * instead.
1186 */
stage2_map_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)1187 static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
1188 enum kvm_pgtable_walk_flags visit)
1189 {
1190 struct stage2_map_data *data = ctx->arg;
1191
1192 switch (visit) {
1193 case KVM_PGTABLE_WALK_TABLE_PRE:
1194 return stage2_map_walk_table_pre(ctx, data);
1195 case KVM_PGTABLE_WALK_LEAF:
1196 return stage2_map_walk_leaf(ctx, data);
1197 case KVM_PGTABLE_WALK_TABLE_POST:
1198 return stage2_coalesce_walk_table_post(ctx, data);
1199 default:
1200 return -EINVAL;
1201 }
1202 }
1203
/*
 * Install a stage-2 mapping of [@addr, @addr + @size) to @phys with
 * protection @prot.
 *
 * @pgt:   page-table to update.
 * @addr:  start of the input address range.
 * @size:  size of the range.
 * @phys:  output address; rounded down to PAGE_SIZE before use.
 * @prot:  permissions/attributes, converted by stage2_set_prot_attr().
 * @mc:    memcache handed to mm_ops->zalloc_page() when tables are needed.
 * @flags: extra walk flags, ORed with TABLE_PRE | LEAF | TABLE_POST.
 *
 * Returns -EINVAL if the table is flagged KVM_PGTABLE_S2_IDMAP and
 * @addr != @phys, the error from stage2_set_prot_attr() if the protection
 * cannot be encoded, and otherwise the result of the walk.
 */
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			   u64 phys, enum kvm_pgtable_prot prot,
			   void *mc, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops;
	struct stage2_map_data map_data = {
		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
		.mmu		= pgt->mmu,
		.memcache	= mc,
		/*
		 * Ask the optional callback, exactly once, whether this
		 * range must be mapped at page granularity.
		 */
		.force_pte	= pte_ops->force_pte_cb &&
				  pte_ops->force_pte_cb(addr, addr + size, prot),
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= flags |
				  KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF |
				  KVM_PGTABLE_WALK_TABLE_POST,
		.arg		= &map_data,
	};

	/* Identity-mapped tables only accept addr == phys. */
	if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
		return -EINVAL;

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	/* Issued whether or not the walk succeeded. */
	dsb(ishst);
	return ret;
}
1240
/*
 * Store @annotation in the entries covering [@addr, @addr + @size) by running
 * the map walker with an invalid physical address and page-granularity
 * mappings forced.
 *
 * @mc supplies memory for any tables the walk has to allocate.
 *
 * Returns -EINVAL if @annotation has PTE_VALID set, otherwise the result of
 * the walk.
 */
int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size,
				void *mc, kvm_pte_t annotation)
{
	struct stage2_map_data map_data = {
		.phys		= KVM_PHYS_INVALID,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= true,
		.annotation	= annotation,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	/* An annotation must leave the entry invalid. */
	if (annotation & PTE_VALID)
		return -EINVAL;

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}
1265
/*
 * Walker callback for kvm_pgtable_stage2_unmap(); ctx->arg is the
 * struct kvm_pgtable being unmapped from. It is registered for LEAF and
 * TABLE_POST visits.
 *
 * - Invalid entries are only touched if pte_is_counted_cb() says they are
 *   counted, in which case they are cleared and their reference is put.
 * - Table entries are left alone unless the child table's page_count() is
 *   exactly 1.
 * - Everything else is removed via stage2_unmap_put_pte() or
 *   stage2_unmap_clear_pte(), depending on whether the entry is counted.
 *
 * Always returns 0.
 */
static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_s2_mmu *mmu = pgt->mmu;
	struct kvm_pgtable_pte_ops *pte_ops = ctx->pte_ops;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = NULL;
	bool need_flush = false;

	if (!kvm_pte_valid(ctx->old)) {
		/* Invalid but counted: clear it and drop its reference. */
		if (pte_ops->pte_is_counted_cb(ctx->old, ctx->level)) {
			kvm_clear_pte(ctx->ptep);
			mm_ops->put_page(ctx->ptep);
		}
		return 0;
	}

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		/* Keep the table while anything else still references it. */
		if (mm_ops->page_count(childp) != 1)
			return 0;
	} else if (stage2_pte_cacheable(pgt, ctx->old)) {
		/* Cache maintenance is only needed without FWB. */
		need_flush = !stage2_has_fwb(pgt);
	}

	/*
	 * This is similar to the map() path in that we unmap the entire
	 * block entry and rely on the remaining portions being faulted
	 * back lazily.
	 */
	if (pte_ops->pte_is_counted_cb(ctx->old, ctx->level))
		stage2_unmap_put_pte(ctx, mmu, mm_ops);
	else
		stage2_unmap_clear_pte(ctx, mmu);

	/* Maintenance runs after the entry has been removed. */
	if (need_flush && mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));

	/* childp is only set for a table entry that was just unlinked. */
	if (childp)
		mm_ops->put_page(childp);

	return 0;
}
1312
/*
 * Remove the stage-2 mappings covering [@addr, @addr + @size) from @pgt.
 *
 * Returns -EINVAL (with a warning) for tables flagged KVM_PGTABLE_S2_IDMAP,
 * otherwise the result of the walk.
 */
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_unmap_walker,
		.arg	= pgt,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};
	int err;

	/*
	 * The TLBI logic in stage2_unmap_walker is not safe for the pKVM host
	 * stage-2 table: there, a child table may have a refcount of 1 while
	 * still containing valid mappings, and the __kvm_tlb_flush_vmid_ipa
	 * used by stage2_unmap_clear_pte does not invalidate every leaf
	 * mapping reachable from that child. Every other stage-2 table holds
	 * a reference for each non-zero PTE, so a child with refcount 1 is
	 * guaranteed to be completely empty.
	 */
	if (WARN_ON(pgt->flags & KVM_PGTABLE_S2_IDMAP))
		return -EINVAL;

	err = kvm_pgtable_walk(pgt, addr, size, &walker);

	if (stage2_unmap_defer_tlb_flush(pgt)) {
		/* Perform the deferred TLB invalidations */
		kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
	}

	return err;
}
1341
stage2_reclaim_leaf_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)1342 static int stage2_reclaim_leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
1343 enum kvm_pgtable_walk_flags visit)
1344 {
1345 struct stage2_map_data *data = ctx->arg;
1346 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1347 kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
1348 u64 size, addr;
1349
1350 /*
1351 * If this table's refcount is not raised, we can safely discard it.
1352 * Any mappings that it contains can be re-created on demand.
1353 */
1354 if (!kvm_level_supports_block_mapping(ctx->level) ||
1355 (mm_ops->page_count(childp) != 1))
1356 return 0;
1357
1358 size = kvm_granule_size(ctx->level);
1359 addr = ALIGN_DOWN(ctx->addr, size);
1360
1361 /* Unlink the table and flush TLBs. */
1362 kvm_clear_pte(ctx->ptep);
1363 kvm_tlb_flush_vmid_range(data->mmu, addr, size);
1364
1365 /* Free the unlinked table, and drop its reference in the parent. */
1366 mm_ops->put_page(ctx->ptep);
1367 mm_ops->put_page(childp);
1368
1369 return 0;
1370 }
1371
/*
 * Walk [@addr, @addr + @size) of @pgt on the way back up (TABLE_POST) and let
 * stage2_reclaim_leaf_walker() discard the child tables it deems reclaimable.
 *
 * Returns the result of the walk.
 */
int kvm_pgtable_stage2_reclaim_leaves(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	/* The walker only consults ->mmu; no physical address is involved. */
	struct stage2_map_data reclaim = {
		.mmu	= pgt->mmu,
		.phys	= KVM_PHYS_INVALID,
	};
	struct kvm_pgtable_walker walker = {
		.flags	= KVM_PGTABLE_WALK_TABLE_POST,
		.cb	= stage2_reclaim_leaf_walker,
		.arg	= &reclaim,
	};

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}
1386
/*
 * Argument block passed to stage2_attr_walker() through the walker's ->arg.
 */
struct stage2_attr_data {
	/* Bits ORed into each valid entry visited (input). */
	kvm_pte_t			attr_set;
	/* Bits cleared from each valid entry visited (input). */
	kvm_pte_t			attr_clr;
	/* Value of the last valid entry visited, before modification (output). */
	kvm_pte_t			pte;
	/* Level of the last valid entry visited (output). */
	s8				level;
};
1393
stage2_attr_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)1394 static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
1395 enum kvm_pgtable_walk_flags visit)
1396 {
1397 kvm_pte_t pte = ctx->old;
1398 struct stage2_attr_data *data = ctx->arg;
1399 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1400
1401 if (!kvm_pte_valid(ctx->old))
1402 return -EAGAIN;
1403
1404 data->level = ctx->level;
1405 data->pte = pte;
1406 pte &= ~data->attr_clr;
1407 pte |= data->attr_set;
1408
1409 /*
1410 * We may race with the CPU trying to set the access flag here,
1411 * but worst-case the access flag update gets lost and will be
1412 * set on the next access instead.
1413 */
1414 if (data->pte != pte) {
1415 /*
1416 * Invalidate instruction cache before updating the guest
1417 * stage-2 PTE if we are going to add executable permission.
1418 */
1419 if (mm_ops->icache_inval_pou &&
1420 stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
1421 mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
1422 kvm_granule_size(ctx->level));
1423
1424 if (!stage2_try_set_pte(ctx, pte))
1425 return -EAGAIN;
1426 }
1427
1428 return 0;
1429 }
1430
/*
 * Set and/or clear attribute bits on the leaf entries covering
 * [@addr, @addr + @size).
 *
 * @pgt:      page-table to update.
 * @attr_set: bits to set; masked to the leaf attribute fields.
 * @attr_clr: bits to clear; masked to the leaf attribute fields.
 * @orig_pte: optional; on success receives the pre-update value of the last
 *            entry visited.
 * @level:    optional; receives the level of the last valid entry visited
 *            on success and also when the walk returns -EAGAIN.
 * @flags:    extra walk flags, ORed with KVM_PGTABLE_WALK_LEAF.
 *
 * Returns the result of the walk.
 */
static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
				    u64 size, kvm_pte_t attr_set,
				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
				    s8 *level, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
	struct stage2_attr_data data = {
		.attr_set	= attr_set & attr_mask,
		.attr_clr	= attr_clr & attr_mask,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_attr_walker,
		.arg		= &data,
		.flags		= flags | KVM_PGTABLE_WALK_LEAF,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);

	/*
	 * kvm_pgtable_stage2_relax_perms() feeds *level to its TLB
	 * invalidation on -EAGAIN as well as on success, so the level must be
	 * reported in both cases; otherwise that caller reads an
	 * uninitialised variable. data.level is zero-initialised, so it is
	 * well-defined even if the walker bailed out before recording a level.
	 */
	if (level && (!ret || ret == -EAGAIN))
		*level = data.level;

	if (ret)
		return ret;

	if (orig_pte)
		*orig_pte = data.pte;

	return 0;
}
1459
/*
 * Clear the stage-2 write-permission bit on the leaf entries covering
 * [@addr, @addr + @size). Returns the result of the attribute walk.
 */
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	const kvm_pte_t clr = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	/* Nothing to set, no outputs wanted, no extra walk flags. */
	return stage2_update_leaf_attrs(pgt, addr, size, 0, clr,
					NULL, NULL, 0);
}
1466
kvm_pgtable_stage2_mkyoung(struct kvm_pgtable * pgt,u64 addr,enum kvm_pgtable_walk_flags flags)1467 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
1468 enum kvm_pgtable_walk_flags flags)
1469 {
1470 kvm_pte_t pte = 0;
1471 int ret;
1472
1473 ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
1474 &pte, NULL, flags);
1475 if (!ret)
1476 dsb(ishst);
1477
1478 return pte;
1479 }
1480
/*
 * Argument block passed to stage2_age_walker() through the walker's ->arg.
 */
struct stage2_age_data {
	/* Input: when true, the walker also clears the access flag. */
	bool	mkold;
	/* Output: set once any visited valid entry had the access flag set. */
	bool	young;
};
1485
stage2_age_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)1486 static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
1487 enum kvm_pgtable_walk_flags visit)
1488 {
1489 kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
1490 struct stage2_age_data *data = ctx->arg;
1491
1492 if (!kvm_pte_valid(ctx->old) || new == ctx->old)
1493 return 0;
1494
1495 data->young = true;
1496
1497 /*
1498 * stage2_age_walker() is always called while holding the MMU lock for
1499 * write, so this will always succeed. Nonetheless, this deliberately
1500 * follows the race detection pattern of the other stage-2 walkers in
1501 * case the locking mechanics of the MMU notifiers is ever changed.
1502 */
1503 if (data->mkold && !stage2_try_set_pte(ctx, new))
1504 return -EAGAIN;
1505
1506 /*
1507 * "But where's the TLBI?!", you scream.
1508 * "Over in the core code", I sigh.
1509 *
1510 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
1511 */
1512 return 0;
1513 }
1514
/*
 * Report whether any valid leaf entry covering [@addr, @addr + @size) has its
 * access flag set and, when @mkold is true, clear the flag on those entries.
 * A failing walk triggers a warning but does not change the return value.
 */
bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
					 u64 size, bool mkold)
{
	struct stage2_age_data age = { .mkold = mkold };
	struct kvm_pgtable_walker walker = {
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.cb	= stage2_age_walker,
		.arg	= &age,
	};

	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));

	return age.young;
}
1530
/*
 * Add the permissions in @prot to the leaf entry translating @addr.
 *
 * @prot may only contain bits from KVM_PGTABLE_PROT_RWX; anything else is
 * rejected with -EINVAL. R and W are granted by setting the corresponding
 * S2AP bits, X by clearing the XN bit.
 *
 * Returns the result of stage2_update_leaf_attrs(). The TLB entry for @addr
 * is invalidated both on success and on -EAGAIN.
 */
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
				   enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	s8 level;
	kvm_pte_t set = 0, clr = 0;

	if (prot & ~KVM_PGTABLE_PROT_RWX)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	/* Execute permission is expressed by clearing execute-never. */
	if (prot & KVM_PGTABLE_PROT_X)
		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

	/*
	 * NOTE(review): 'level' has no initialiser and is only ever written
	 * by stage2_update_leaf_attrs(); confirm that it is populated on the
	 * -EAGAIN path before it is passed to the invalidation below.
	 */
	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
	if (!ret || ret == -EAGAIN)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
	return ret;
}
1555
stage2_flush_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)1556 static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
1557 enum kvm_pgtable_walk_flags visit)
1558 {
1559 struct kvm_pgtable *pgt = ctx->arg;
1560 struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1561
1562 if (!stage2_pte_cacheable(pgt, ctx->old))
1563 return 0;
1564
1565 if (mm_ops->dcache_clean_inval_poc)
1566 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1567 kvm_granule_size(ctx->level));
1568 return 0;
1569 }
1570
/*
 * Run stage2_flush_walker() over the leaf entries covering
 * [@addr, @addr + @size). When stage2_has_fwb() reports true for @pgt no
 * walk is performed and 0 is returned; otherwise the walk result is returned.
 */
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.arg	= pgt,
		.cb	= stage2_flush_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
	};

	return stage2_has_fwb(pgt) ? 0 :
	       kvm_pgtable_walk(pgt, addr, size, &walker);
}
1584
/*
 * Build a page-table that is not linked into @pgt and that maps
 * kvm_granule_size(@level) bytes starting at @phys with protection @prot.
 *
 * @pgt:       supplies the mmu, mm_ops and pte_ops used for the new table.
 * @phys:      output address; must be aligned to kvm_granule_size(@level).
 * @level:     level of the entry the new table is meant to replace; the
 *             table itself is walked at @level + 1.
 * @prot:      permissions/attributes, converted by stage2_set_prot_attr().
 * @mc:        memcache passed to mm_ops->zalloc_page().
 * @force_pte: forwarded to the map walker as stage2_map_data::force_pte.
 *
 * Returns the new table, or an ERR_PTR(): -EINVAL for a misaligned @phys,
 * the error from stage2_set_prot_attr(), -ENOMEM if the first page cannot be
 * allocated, or the error from the walk (after the partial table is freed).
 */
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
					      u64 phys, s8 level,
					      enum kvm_pgtable_prot prot,
					      void *mc, bool force_pte)
{
	struct stage2_map_data map_data = {
		.phys		= phys,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= force_pte,
	};
	/* No break-before-make TLBI and no CMOs are requested for this walk. */
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_LEAF |
				  KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
				  KVM_PGTABLE_WALK_SKIP_CMO,
		.arg		= &map_data,
	};
	/*
	 * The input address (.addr) is irrelevant for walking an
	 * unlinked table. Construct an ambiguous IA range to map
	 * kvm_granule_size(level) worth of memory.
	 */
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};
	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
	struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops;
	kvm_pte_t *pgtable;
	int ret;

	if (!IS_ALIGNED(phys, kvm_granule_size(level)))
		return ERR_PTR(-EINVAL);

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ERR_PTR(ret);

	pgtable = mm_ops->zalloc_page(mc);
	if (!pgtable)
		return ERR_PTR(-ENOMEM);

	/* Populate the new table, treating it as the root of the walk. */
	ret = __kvm_pgtable_walk(&data, mm_ops, pte_ops, (kvm_pteref_t)pgtable,
				 level + 1);
	if (ret) {
		/* Tear down whatever was built before the failure. */
		kvm_pgtable_stage2_free_unlinked(mm_ops, pte_ops, pgtable, level);
		return ERR_PTR(ret);
	}

	return pgtable;
}
1638
1639 /*
1640 * Get the number of page-tables needed to replace a block with a
1641 * fully populated tree up to the PTE entries. Note that @level is
1642 * interpreted as in "level @level entry".
1643 */
stage2_block_get_nr_page_tables(s8 level)1644 static int stage2_block_get_nr_page_tables(s8 level)
1645 {
1646 switch (level) {
1647 case 1:
1648 return PTRS_PER_PTE + 1;
1649 case 2:
1650 return 1;
1651 case 3:
1652 return 0;
1653 default:
1654 WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
1655 level > KVM_PGTABLE_LAST_LEVEL);
1656 return -EINVAL;
1657 };
1658 }
1659
stage2_split_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)1660 static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
1661 enum kvm_pgtable_walk_flags visit)
1662 {
1663 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1664 struct kvm_mmu_memory_cache *mc = ctx->arg;
1665 struct kvm_s2_mmu *mmu;
1666 kvm_pte_t pte = ctx->old, new, *childp;
1667 enum kvm_pgtable_prot prot;
1668 s8 level = ctx->level;
1669 struct kvm_pgtable_pte_ops *pte_ops = ctx->pte_ops;
1670 bool force_pte;
1671 int nr_pages;
1672 u64 phys;
1673
1674 /* No huge-pages exist at the last level */
1675 if (level == KVM_PGTABLE_LAST_LEVEL)
1676 return 0;
1677
1678 /* We only split valid block mappings */
1679 if (!kvm_pte_valid(pte))
1680 return 0;
1681
1682 nr_pages = stage2_block_get_nr_page_tables(level);
1683 if (nr_pages < 0)
1684 return nr_pages;
1685
1686 if (mc->nobjs >= nr_pages) {
1687 /* Build a tree mapped down to the PTE granularity. */
1688 force_pte = true;
1689 } else {
1690 /*
1691 * Don't force PTEs, so create_unlinked() below does
1692 * not populate the tree up to the PTE level. The
1693 * consequence is that the call will require a single
1694 * page of level 2 entries at level 1, or a single
1695 * page of PTEs at level 2. If we are at level 1, the
1696 * PTEs will be created recursively.
1697 */
1698 force_pte = false;
1699 nr_pages = 1;
1700 }
1701
1702 if (mc->nobjs < nr_pages)
1703 return -ENOMEM;
1704
1705 mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
1706 phys = kvm_pte_to_phys(pte);
1707 prot = kvm_pgtable_stage2_pte_prot(pte);
1708
1709 childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
1710 level, prot, mc, force_pte);
1711 if (IS_ERR(childp))
1712 return PTR_ERR(childp);
1713
1714 if (!stage2_try_break_pte(ctx, mmu)) {
1715 kvm_pgtable_stage2_free_unlinked(mm_ops, pte_ops, childp, level);
1716 return -EAGAIN;
1717 }
1718
1719 /*
1720 * Note, the contents of the page table are guaranteed to be made
1721 * visible before the new PTE is assigned because stage2_make_pte()
1722 * writes the PTE using smp_store_release().
1723 */
1724 new = kvm_init_table_pte(childp, mm_ops);
1725 stage2_make_pte(ctx, new);
1726 return 0;
1727 }
1728
/*
 * LEAF walker selected by kvm_pgtable_stage2_split() when
 * kvm_protected_mode_initialized is set; ctx->arg is a
 * struct stage2_map_data providing ->mmu and ->memcache.
 *
 * Last-level entries are skipped. Any other entry must be a valid block one
 * level above the last level, otherwise -EINVAL is returned. Such a block is
 * replaced by a table built with force_pte set.
 *
 * Returns 0 on success or when skipping, -EINVAL as described above, or the
 * error from kvm_pgtable_stage2_create_unlinked().
 */
static int pkvm_stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
				    enum kvm_pgtable_walk_flags visit)
{
	struct stage2_map_data *data = ctx->arg;
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_hyp_memcache *mc = data->memcache;
	enum kvm_pgtable_prot prot;
	kvm_pte_t pte = ctx->old;
	kvm_pte_t *childp;

	if (ctx->level == KVM_PGTABLE_LAST_LEVEL)
		return 0;

	/* We can only split PMD-level blocks */
	if (!kvm_pte_valid(pte) || ctx->level != KVM_PGTABLE_LAST_LEVEL - 1)
		return -EINVAL;

	/* Rebuild the block's mapping as an unlinked table of leaf entries. */
	prot = kvm_pgtable_stage2_pte_prot(pte);
	childp = kvm_pgtable_stage2_create_unlinked(pgt, kvm_pte_to_phys(pte),
						    ctx->level, prot, mc, true);
	if (IS_ERR(childp))
		return PTR_ERR(childp);

	/*
	 * A failed break is only warned about; the table entry is installed
	 * regardless.
	 */
	WARN_ON(!stage2_try_break_pte(ctx, data->mmu));

	stage2_make_pte(ctx, kvm_init_table_pte(childp, ctx->mm_ops));

	return 0;
}
1758
/*
 * Split the block mappings covering [@addr, @addr + @size) of @pgt into
 * tables.
 *
 * @mc is the memory cache used for the new tables. When
 * kvm_protected_mode_initialized is set the walk uses
 * pkvm_stage2_split_walker() with a struct stage2_map_data wrapping @mc;
 * otherwise it uses stage2_split_walker() with @mc itself as the argument.
 *
 * Returns the result of the walk.
 */
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc)
{
	/*
	 * Sample the static key once so that the callback and its argument
	 * always match: the two walkers interpret ->arg as different types.
	 */
	const bool pkvm = static_branch_unlikely(&kvm_protected_mode_initialized);
	struct stage2_map_data data = {
		.mmu		= pgt->mmu,
		.memcache	= mc,
	};
	struct kvm_pgtable_walker walker = {
		.cb	= pkvm ? pkvm_stage2_split_walker : stage2_split_walker,
		.arg	= pkvm ? (void *)&data : mc,
		.flags	= KVM_PGTABLE_WALK_LEAF,
	};
	int ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	/* Issued whether or not the walk succeeded. */
	dsb(ishst);
	return ret;
}
1778
__kvm_pgtable_stage2_init(struct kvm_pgtable * pgt,struct kvm_s2_mmu * mmu,struct kvm_pgtable_mm_ops * mm_ops,enum kvm_pgtable_stage2_flags flags,struct kvm_pgtable_pte_ops * pte_ops)1779 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
1780 struct kvm_pgtable_mm_ops *mm_ops,
1781 enum kvm_pgtable_stage2_flags flags,
1782 struct kvm_pgtable_pte_ops *pte_ops)
1783 {
1784 size_t pgd_sz;
1785 u64 vtcr = mmu->vtcr;
1786 u32 ia_bits = VTCR_EL2_IPA(vtcr);
1787 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1788 s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1789
1790 pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1791 pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
1792 if (!pgt->pgd)
1793 return -ENOMEM;
1794
1795 pgt->ia_bits = ia_bits;
1796 pgt->start_level = start_level;
1797 pgt->mm_ops = mm_ops;
1798 pgt->mmu = mmu;
1799 pgt->flags = flags;
1800 pgt->pte_ops = pte_ops;
1801
1802 /* Ensure zeroed PGD pages are visible to the hardware walker */
1803 dsb(ishst);
1804 return 0;
1805 }
1806
kvm_pgtable_stage2_pgd_size(u64 vtcr)1807 size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
1808 {
1809 u32 ia_bits = VTCR_EL2_IPA(vtcr);
1810 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1811 s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1812
1813 return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1814 }
1815
stage2_free_walker(const struct kvm_pgtable_visit_ctx * ctx,enum kvm_pgtable_walk_flags visit)1816 static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
1817 enum kvm_pgtable_walk_flags visit)
1818 {
1819 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1820 struct kvm_pgtable_pte_ops *pte_ops = ctx->pte_ops;
1821
1822 if (!pte_ops->pte_is_counted_cb(ctx->old, ctx->level))
1823 return 0;
1824
1825 mm_ops->put_page(ctx->ptep);
1826
1827 if (kvm_pte_table(ctx->old, ctx->level))
1828 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
1829
1830 return 0;
1831 }
1832
kvm_pgtable_stage2_destroy(struct kvm_pgtable * pgt)1833 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
1834 {
1835 size_t pgd_sz;
1836 struct kvm_pgtable_walker walker = {
1837 .cb = stage2_free_walker,
1838 .flags = KVM_PGTABLE_WALK_LEAF |
1839 KVM_PGTABLE_WALK_TABLE_POST,
1840 };
1841
1842 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
1843 pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
1844 pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
1845 pgt->pgd = NULL;
1846 }
1847
/*
 * Free a page-table that is no longer linked into any stage-2.
 *
 * @mm_ops:  memory-management callbacks used for put_page()/page_count().
 * @pte_ops: PTE callbacks handed to the walk.
 * @pgtable: the unlinked table.
 * @level:   level of the entry that used to point at @pgtable; the table's
 *           own entries are walked at @level + 1.
 *
 * The table's contents are released with stage2_free_walker(), then the
 * table page itself is put. Warnings are raised if the walk fails or if the
 * page's count is not 1 at that point.
 */
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops,
				      struct kvm_pgtable_pte_ops *pte_ops,
				      void *pgtable, s8 level)
{
	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,

		/*
		 * At this point the IPA really doesn't matter, as the page
		 * table being traversed has already been removed from the stage
		 * 2. Set an appropriate range to cover the entire page table.
		 */
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};

	/* Drop the references held by the table's entries. */
	WARN_ON(__kvm_pgtable_walk(&data, mm_ops, pte_ops, ptep, level + 1));

	/* Only the table page's own reference is expected to remain. */
	WARN_ON(mm_ops->page_count(pgtable) != 1);
	mm_ops->put_page(pgtable);
}
1875