// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

static unsigned long io_map_base;

#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))

#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)

static bool is_iomap(unsigned long flags)
{
	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
}

static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
{
	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
}

/*
 * D-Cache management functions. They take the page table entries by
 * value, as they are flushing the cache using the kernel mapping (or
 * kmap on 32bit).
 */
static void kvm_flush_dcache_pte(pte_t pte)
{
	__kvm_flush_dcache_pte(pte);
}

static void kvm_flush_dcache_pmd(pmd_t pmd)
{
	__kvm_flush_dcache_pmd(pmd);
}

static void kvm_flush_dcache_pud(pud_t pud)
{
	__kvm_flush_dcache_pud(pud);
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_valid(pfn);
}

/**
 * stage2_dissolve_pmd() - clear and flush huge PMD entry
 * @kvm:	pointer to kvm structure.
 * @addr:	IPA
 * @pmd:	pmd pointer for IPA
 *
 * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
 */
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
	if (!pmd_thp_or_huge(*pmd))
		return;

	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	put_page(virt_to_page(pmd));
}
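
/*
 * Note on reference counting: throughout this file the page refcount of a
 * stage-2 (and hyp) page-table page is used to track how many valid entries
 * the table currently holds.  Installing an entry takes a reference on the
 * page that contains it (get_page()), clearing an entry drops it
 * (put_page()), and a table that has become empty is unhooked from its
 * parent and freed by the unmap walkers below.
 */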

/**
 * stage2_dissolve_pud() - clear and flush huge PUD entry
 * @kvm:	pointer to kvm structure.
 * @addr:	IPA
 * @pud:	pud pointer for IPA
 *
 * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
 */
static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
{
	if (!stage2_pud_huge(kvm, *pudp))
		return;

	stage2_pud_clear(kvm, pudp);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	put_page(virt_to_page(pudp));
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  int min, int max)
{
	void *page;

	BUG_ON(max > KVM_NR_MEM_OBJS);
	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < max) {
		page = (void *)__get_free_page(GFP_PGTABLE_USER);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc || !mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
	stage2_pgd_clear(kvm, pgd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	stage2_pud_free(kvm, pud_table);
	put_page(virt_to_page(pgd));
}

static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
	stage2_pud_clear(kvm, pud);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	stage2_pmd_free(kvm, pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);
	VM_BUG_ON(pmd_thp_or_huge(*pmd));
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	free_page((unsigned long)pte_table);
	put_page(virt_to_page(pmd));
}

static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
{
	WRITE_ONCE(*ptep, new_pte);
	dsb(ishst);
}

static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
{
	WRITE_ONCE(*pmdp, new_pmd);
	dsb(ishst);
}

static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
{
	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
}

static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
{
	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
	dsb(ishst);
}

static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
{
	WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
	dsb(ishst);
}
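
/*
 * The setters above pair WRITE_ONCE() with dsb(ishst): the single-copy
 * write stops the compiler from tearing the descriptor update, and the
 * store barrier makes the new entry visible to the page-table walker
 * before any subsequent TLB maintenance or faulting access can observe it.
 */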

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 * the IO subsystem will never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t start_addr = addr;
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			pte_t old_pte = *pte;

			kvm_set_pte(pte, __pte(0));
			kvm_tlb_flush_vmid_ipa(kvm, addr);

			/* No need to invalidate the cache for device mappings */
			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
				kvm_flush_dcache_pte(old_pte);

			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (stage2_pte_table_empty(kvm, start_pte))
		clear_stage2_pmd_entry(kvm, pmd, start_addr);
}

static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
	do {
		next = stage2_pmd_addr_end(kvm, addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd)) {
				pmd_t old_pmd = *pmd;

				pmd_clear(pmd);
				kvm_tlb_flush_vmid_ipa(kvm, addr);

				kvm_flush_dcache_pmd(old_pmd);

				put_page(virt_to_page(pmd));
			} else {
				unmap_stage2_ptes(kvm, pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);

	if (stage2_pmd_table_empty(kvm, start_pmd))
		clear_stage2_pud_entry(kvm, pud, start_addr);
}

static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next, start_addr = addr;
	pud_t *pud, *start_pud;

	start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
	do {
		next = stage2_pud_addr_end(kvm, addr, end);
		if (!stage2_pud_none(kvm, *pud)) {
			if (stage2_pud_huge(kvm, *pud)) {
				pud_t old_pud = *pud;

				stage2_pud_clear(kvm, pud);
				kvm_tlb_flush_vmid_ipa(kvm, addr);
				kvm_flush_dcache_pud(old_pud);
				put_page(virt_to_page(pud));
			} else {
				unmap_stage2_pmds(kvm, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);

	if (stage2_pud_table_empty(kvm, start_pud))
		clear_stage2_pgd_entry(kvm, pgd, start_addr);
}

/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @kvm:   The VM pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	assert_spin_locked(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);

	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
	do {
		/*
		 * Make sure the page table is still active, as another thread
		 * could have possibly freed the page table, while we released
		 * the lock.
		 */
		if (!READ_ONCE(kvm->arch.pgd))
			break;
		next = stage2_pgd_addr_end(kvm, addr, end);
		if (!stage2_pgd_none(kvm, *pgd))
			unmap_stage2_puds(kvm, pgd, addr, next);
		/*
		 * If the range is too large, release the kvm->mmu_lock
		 * to prevent starvation and lockup detector warnings.
		 */
		if (next != end)
			cond_resched_lock(&kvm->mmu_lock);
	} while (pgd++, addr = next, addr != end);
}

static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
			      phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
			kvm_flush_dcache_pte(*pte);
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
			      phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = stage2_pmd_offset(kvm, pud, addr);
	do {
		next = stage2_pmd_addr_end(kvm, addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd))
				kvm_flush_dcache_pmd(*pmd);
			else
				stage2_flush_ptes(kvm, pmd, addr, next);
		}
	} while (pmd++, addr = next, addr != end);
}

static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
			      phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = stage2_pud_offset(kvm, pgd, addr);
	do {
		next = stage2_pud_addr_end(kvm, addr, end);
		if (!stage2_pud_none(kvm, *pud)) {
			if (stage2_pud_huge(kvm, *pud))
				kvm_flush_dcache_pud(*pud);
			else
				stage2_flush_pmds(kvm, pud, addr, next);
		}
	} while (pud++, addr = next, addr != end);
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
	phys_addr_t next;
	pgd_t *pgd;

	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
	do {
		next = stage2_pgd_addr_end(kvm, addr, end);
		if (!stage2_pgd_none(kvm, *pgd))
			stage2_flush_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}
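
/*
 * stage2_flush_vm() below runs the above walkers over every memslot to
 * clean+invalidate anything the guest may have written while its caches
 * were off; it is typically used on the path that handles the guest
 * toggling its MMU/caches.
 */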

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_flush_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void clear_hyp_pgd_entry(pgd_t *pgd)
{
	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
	pgd_clear(pgd);
	pud_free(NULL, pud_table);
	put_page(virt_to_page(pgd));
}

static void clear_hyp_pud_entry(pud_t *pud)
{
	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
	VM_BUG_ON(pud_huge(*pud));
	pud_clear(pud);
	pmd_free(NULL, pmd_table);
	put_page(virt_to_page(pud));
}

static void clear_hyp_pmd_entry(pmd_t *pmd)
{
	pte_t *pte_table = pte_offset_kernel(pmd, 0);
	VM_BUG_ON(pmd_thp_or_huge(*pmd));
	pmd_clear(pmd);
	pte_free_kernel(NULL, pte_table);
	put_page(virt_to_page(pmd));
}

static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte, *start_pte;

	start_pte = pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			kvm_set_pte(pte, __pte(0));
			put_page(virt_to_page(pte));
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);

	if (hyp_pte_table_empty(start_pte))
		clear_hyp_pmd_entry(pmd);
}

static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next;
	pmd_t *pmd, *start_pmd;

	start_pmd = pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		/* Hyp doesn't use huge pmds */
		if (!pmd_none(*pmd))
			unmap_hyp_ptes(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);

	if (hyp_pmd_table_empty(start_pmd))
		clear_hyp_pud_entry(pud);
}

static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t next;
	pud_t *pud, *start_pud;

	start_pud = pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		/* Hyp doesn't use huge puds */
		if (!pud_none(*pud))
			unmap_hyp_pmds(pud, addr, next);
	} while (pud++, addr = next, addr != end);

	if (hyp_pud_table_empty(start_pud))
		clear_hyp_pgd_entry(pgd);
}

static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
{
	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
}

static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
			      phys_addr_t start, u64 size)
{
	pgd_t *pgd;
	phys_addr_t addr = start, end = start + size;
	phys_addr_t next;

	/*
	 * We don't unmap anything from HYP, except at the hyp tear down.
	 * Hence, we don't have to invalidate the TLBs here.
	 */
	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
	do {
		next = pgd_addr_end(addr, end);
		if (!pgd_none(*pgd))
			unmap_hyp_puds(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
{
	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
}

static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
{
	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
 * therefore contains either mappings in the kernel memory area (above
 * PAGE_OFFSET), or device mappings in the idmap range.
 *
 * boot_hyp_pgd should only map the idmap range, and is only used in
 * the extended idmap case.
 */
void free_hyp_pgds(void)
{
	pgd_t *id_pgd;

	mutex_lock(&kvm_hyp_pgd_mutex);

	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;

	if (id_pgd) {
		/* In case we never called hyp_mmu_init() */
		if (!io_map_base)
			io_map_base = hyp_idmap_start;
		unmap_hyp_idmap_range(id_pgd, io_map_base,
				      hyp_idmap_start + PAGE_SIZE - io_map_base);
	}

	if (boot_hyp_pgd) {
		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
		boot_hyp_pgd = NULL;
	}

	if (hyp_pgd) {
		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
				(uintptr_t)high_memory - PAGE_OFFSET);

		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
		hyp_pgd = NULL;
	}
	if (merged_hyp_pgd) {
		clear_page(merged_hyp_pgd);
		free_page((unsigned long)merged_hyp_pgd);
		merged_hyp_pgd = NULL;
	}

	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
				    unsigned long end, unsigned long pfn,
				    pgprot_t prot)
{
	pte_t *pte;
	unsigned long addr;

	addr = start;
	do {
		pte = pte_offset_kernel(pmd, addr);
		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
		get_page(virt_to_page(pte));
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
}

static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pmd_t *pmd;
	pte_t *pte;
	unsigned long addr, next;

	addr = start;
	do {
		pmd = pmd_offset(pud, addr);

		BUG_ON(pmd_sect(*pmd));

		if (pmd_none(*pmd)) {
			pte = pte_alloc_one_kernel(NULL);
			if (!pte) {
				kvm_err("Cannot allocate Hyp pte\n");
				return -ENOMEM;
			}
			kvm_pmd_populate(pmd, pte);
			get_page(virt_to_page(pmd));
		}

		next = pmd_addr_end(addr, end);

		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot)
{
	pud_t *pud;
	pmd_t *pmd;
	unsigned long addr, next;
	int ret;

	addr = start;
	do {
		pud = pud_offset(pgd, addr);

		if (pud_none_or_clear_bad(pud)) {
			pmd = pmd_alloc_one(NULL, addr);
			if (!pmd) {
				kvm_err("Cannot allocate Hyp pmd\n");
				return -ENOMEM;
			}
			kvm_pud_populate(pud, pmd);
			get_page(virt_to_page(pud));
		}

		next = pud_addr_end(addr, end);
		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
		if (ret)
			return ret;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);

	return 0;
}

static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
				 unsigned long start, unsigned long end,
				 unsigned long pfn, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	unsigned long addr, next;
	int err = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);
	addr = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
	do {
		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);

		if (pgd_none(*pgd)) {
			pud = pud_alloc_one(NULL, addr);
			if (!pud) {
				kvm_err("Cannot allocate Hyp pud\n");
				err = -ENOMEM;
				goto out;
			}
			kvm_pgd_populate(pgd, pud);
			get_page(virt_to_page(pgd));
		}

		next = pgd_addr_end(addr, end);
		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
		if (err)
			goto out;
		pfn += (next - addr) >> PAGE_SHIFT;
	} while (addr = next, addr != end);
out:
	mutex_unlock(&kvm_hyp_pgd_mutex);
	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}
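
/*
 * A typical use of create_hyp_mappings() below is to mirror a kernel object
 * into the Hyp address space before EL2 code dereferences it, along the
 * lines of (illustrative, as done for per-vcpu state elsewhere in KVM/arm):
 *
 *	err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
 */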

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, pgprot_t prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
					    virt_addr, virt_addr + PAGE_SIZE,
					    __phys_to_pfn(phys_addr),
					    prot);
		if (err)
			return err;
	}

	return 0;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr, pgprot_t prot)
{
	pgd_t *pgd = hyp_pgd;
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	base = io_map_base - size;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		ret = -ENOMEM;
	else
		io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret)
		goto out;

	if (__kvm_cpu_uses_extended_idmap())
		pgd = boot_hyp_pgd;

	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
				    base, base + size,
				    __phys_to_pfn(phys_addr), prot);
	if (ret)
		goto out;

	*haddr = base + offset_in_page(phys_addr);

out:
	return ret;
}
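
/*
 * Worked example of the allocation above (illustrative numbers, 4K pages):
 * a mapping of size 0x1000 at a physical address with offset_in_page() of
 * 0x800 rounds up to size 0x2000, so base = io_map_base - 0x2000 and the
 * caller gets *haddr = base + 0x800.  The XOR test catches the case where
 * the subtraction crosses BIT(VA_BITS - 1), i.e. where the downward
 * allocator has run out of the idmap/IO half of the HYP VA range.
 */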

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

/**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Allocates only the stage-2 HW PGD level table(s) of size defined by
 * stage2_pgd_size(kvm).
 *
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
	phys_addr_t pgd_phys;
	pgd_t *pgd;

	if (kvm->arch.pgd != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	/* Allocate the HW PGD, making sure that each page gets its own refcount */
	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
	if (!pgd)
		return -ENOMEM;

	pgd_phys = virt_to_phys(pgd);
	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
		return -EINVAL;

	kvm->arch.pgd = pgd;
	kvm->arch.pgd_phys = pgd_phys;
	return 0;
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memory regions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	down_read(&current->mm->mmap_sem);
	spin_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, slots)
		stage2_unmap_memslot(kvm, memslot);

	spin_unlock(&kvm->mmu_lock);
	up_read(&current->mm->mmap_sem);
	srcu_read_unlock(&kvm->srcu, idx);
}
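
/*
 * stage2_unmap_vm() is used on the vcpu (re)init path: dropping every RAM
 * mapping forces a rebooted guest to fault its pages back in, which lets
 * KVM detect whether the guest MMU is off and clean the caches as needed.
 */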

/**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
 *
 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
 * underlying level-2 and level-3 tables before freeing the actual level-1 table
 * and setting the struct pointer to NULL.
 */
void kvm_free_stage2_pgd(struct kvm *kvm)
{
	void *pgd = NULL;

	spin_lock(&kvm->mmu_lock);
	if (kvm->arch.pgd) {
		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
		pgd = READ_ONCE(kvm->arch.pgd);
		kvm->arch.pgd = NULL;
		kvm->arch.pgd_phys = 0;
	}
	spin_unlock(&kvm->mmu_lock);

	/* Free the HW pgd, one page at a time */
	if (pgd)
		free_pages_exact(pgd, stage2_pgd_size(kvm));
}

static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
	if (stage2_pgd_none(kvm, *pgd)) {
		if (!cache)
			return NULL;
		pud = mmu_memory_cache_alloc(cache);
		stage2_pgd_populate(kvm, pgd, pud);
		get_page(virt_to_page(pgd));
	}

	return stage2_pud_offset(kvm, pgd, addr);
}

static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			     phys_addr_t addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = stage2_get_pud(kvm, cache, addr);
	if (!pud || stage2_pud_huge(kvm, *pud))
		return NULL;

	if (stage2_pud_none(kvm, *pud)) {
		if (!cache)
			return NULL;
		pmd = mmu_memory_cache_alloc(cache);
		stage2_pud_populate(kvm, pud, pmd);
		get_page(virt_to_page(pud));
	}

	return stage2_pmd_offset(kvm, pud, addr);
}
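
/*
 * Note that, unlike the host walkers, the stage2_*() accessors above take
 * the struct kvm pointer: the number of stage-2 levels and the size of the
 * concatenated top-level table depend on the IPA size configured for this
 * particular VM, so none of them can be compile-time constants.
 */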

static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
{
	pmd_t *pmd, old_pmd;

retry:
	pmd = stage2_get_pmd(kvm, cache, addr);
	VM_BUG_ON(!pmd);

	old_pmd = *pmd;
	/*
	 * Multiple vcpus faulting on the same PMD entry, can
	 * lead to them sequentially updating the PMD with the
	 * same value. Following the break-before-make
	 * (pmd_clear() followed by tlb_flush()) process can
	 * hinder forward progress due to refaults generated
	 * on missing translations.
	 *
	 * Skip updating the page table if the entry is
	 * unchanged.
	 */
	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
		return 0;

	if (pmd_present(old_pmd)) {
		/*
		 * If we already have PTE level mapping for this block,
		 * we must unmap it to avoid inconsistent TLB state and
		 * leaking the table page. We could end up in this situation
		 * if the memory slot was marked for dirty logging and was
		 * reverted, leaving PTE level mappings for the pages accessed
		 * during the period. So, unmap the PTE level mapping for this
		 * block and retry, as we could have released the upper level
		 * table in the process.
		 *
		 * Normal THP split/merge follows mmu_notifier callbacks and
		 * gets handled accordingly.
		 */
		if (!pmd_thp_or_huge(old_pmd)) {
			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
			goto retry;
		}
		/*
		 * Mapping in huge pages should only happen through a
		 * fault.  If a page is merged into a transparent huge
		 * page, the individual subpages of that huge page
		 * should be unmapped through MMU notifiers before we
		 * get here.
		 *
		 * Merging of CompoundPages is not supported; they
		 * should be split first, then unmapped, merged,
		 * and mapped back in on demand.
		 */
		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
		pmd_clear(pmd);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		get_page(virt_to_page(pmd));
	}

	kvm_set_pmd(pmd, *new_pmd);
	return 0;
}

static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			       phys_addr_t addr, const pud_t *new_pudp)
{
	pud_t *pudp, old_pud;

retry:
	pudp = stage2_get_pud(kvm, cache, addr);
	VM_BUG_ON(!pudp);

	old_pud = *pudp;

	/*
	 * A large number of vcpus faulting on the same stage 2 entry,
	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
	 * Skip updating the page tables if there is no change.
	 */
	if (pud_val(old_pud) == pud_val(*new_pudp))
		return 0;

	if (stage2_pud_present(kvm, old_pud)) {
		/*
		 * If we already have table level mapping for this block, unmap
		 * the range for this block and retry.
		 */
		if (!stage2_pud_huge(kvm, old_pud)) {
			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
			goto retry;
		}

		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
		stage2_pud_clear(kvm, pudp);
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		get_page(virt_to_page(pudp));
	}

	kvm_set_pud(pudp, *new_pudp);
	return 0;
}

/*
 * stage2_get_leaf_entry - walk the stage2 VM page tables and return
 * true if a valid and present leaf-entry is found. A pointer to the
 * leaf-entry is returned in the appropriate level variable - pudpp,
 * pmdpp, ptepp.
 */
static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
{
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	*pudpp = NULL;
	*pmdpp = NULL;
	*ptepp = NULL;

	pudp = stage2_get_pud(kvm, NULL, addr);
	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
		return false;

	if (stage2_pud_huge(kvm, *pudp)) {
		*pudpp = pudp;
		return true;
	}

	pmdp = stage2_pmd_offset(kvm, pudp, addr);
	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
		return false;

	if (pmd_thp_or_huge(*pmdp)) {
		*pmdpp = pmdp;
		return true;
	}

	ptep = pte_offset_kernel(pmdp, addr);
	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
		return false;

	*ptepp = ptep;
	return true;
}

static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
{
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	bool found;

	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
	if (!found)
		return false;

	if (pudp)
		return kvm_s2pud_exec(pudp);
	else if (pmdp)
		return kvm_s2pmd_exec(pmdp);
	else
		return kvm_s2pte_exec(ptep);
}
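
/*
 * stage2_set_pte() below takes two flags: KVM_S2PTE_FLAG_IS_IOMAP makes the
 * call fail with -EFAULT if a mapping already exists (device ranges are
 * never remapped in place), and KVM_S2_FLAG_LOGGING_ACTIVE dissolves any
 * huge PUD/PMD covering the address so that dirty logging can track the
 * guest at page granularity.
 */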

static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
			  phys_addr_t addr, const pte_t *new_pte,
			  unsigned long flags)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, old_pte;
	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;

	VM_BUG_ON(logging_active && !cache);

	/* Create stage-2 page table mapping - Levels 0 and 1 */
	pud = stage2_get_pud(kvm, cache, addr);
	if (!pud) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/*
	 * While dirty page logging is active, dissolve a huge PUD, then
	 * continue on to allocate a page-level mapping.
	 */
	if (logging_active)
		stage2_dissolve_pud(kvm, addr, pud);

	if (stage2_pud_none(kvm, *pud)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pmd = mmu_memory_cache_alloc(cache);
		stage2_pud_populate(kvm, pud, pmd);
		get_page(virt_to_page(pud));
	}

	pmd = stage2_pmd_offset(kvm, pud, addr);
	if (!pmd) {
		/*
		 * Ignore calls from kvm_set_spte_hva for unallocated
		 * address ranges.
		 */
		return 0;
	}

	/*
	 * While dirty page logging is active, dissolve a huge PMD, then
	 * continue on to allocate a page-level mapping.
	 */
	if (logging_active)
		stage2_dissolve_pmd(kvm, addr, pmd);

	/* Create stage-2 page mappings - Level 2 */
	if (pmd_none(*pmd)) {
		if (!cache)
			return 0; /* ignore calls from kvm_set_spte_hva */
		pte = mmu_memory_cache_alloc(cache);
		kvm_pmd_populate(pmd, pte);
		get_page(virt_to_page(pmd));
	}

	pte = pte_offset_kernel(pmd, addr);

	if (iomap && pte_present(*pte))
		return -EFAULT;

	/* Create 2nd stage page table mapping - Level 3 */
	old_pte = *pte;
	if (pte_present(old_pte)) {
		/* Skip page table update if there is no change */
		if (pte_val(old_pte) == pte_val(*new_pte))
			return 0;

		kvm_set_pte(pte, __pte(0));
		kvm_tlb_flush_vmid_ipa(kvm, addr);
	} else {
		get_page(virt_to_page(pte));
	}

	kvm_set_pte(pte, *new_pte);
	return 0;
}

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static int stage2_ptep_test_and_clear_young(pte_t *pte)
{
	if (pte_young(*pte)) {
		*pte = pte_mkold(*pte);
		return 1;
	}
	return 0;
}
#else
static int stage2_ptep_test_and_clear_young(pte_t *pte)
{
	return __ptep_test_and_clear_young(pte);
}
#endif

static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
{
	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}

static int stage2_pudp_test_and_clear_young(pud_t *pud)
{
	return stage2_ptep_test_and_clear_young((pte_t *)pud);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr, end;
	int ret = 0;
	unsigned long pfn;
	struct kvm_mmu_memory_cache cache = { 0, };

	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
	pfn = __phys_to_pfn(pa);

	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);

		if (writable)
			pte = kvm_s2pte_mkwrite(pte);

		ret = mmu_topup_memory_cache(&cache,
					     kvm_mmu_cache_min_pages(kvm),
					     KVM_NR_MEM_OBJS);
		if (ret)
			goto out;
		spin_lock(&kvm->mmu_lock);
		ret = stage2_set_pte(kvm, &cache, addr, &pte,
				     KVM_S2PTE_FLAG_IS_IOMAP);
		spin_unlock(&kvm->mmu_lock);
		if (ret)
			goto out;

		pfn++;
	}

out:
	mmu_free_memory_cache(&cache);
	return ret;
}
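
/*
 * kvm_phys_addr_ioremap() is the hook used when a device region needs to be
 * pre-mapped into the guest rather than faulted in, e.g. (illustrative) the
 * vgic code mapping the GIC virtual CPU interface at its guest base address,
 * since MMIO emulation cannot satisfy accesses to that range.
 */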

static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;
	gfn_t gfn = *ipap >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);

	/*
	 * PageTransCompoundMap() returns true for THP and
	 * hugetlbfs. Make sure the adjustment is done only for THP
	 * pages.
	 */
	if (!PageHuge(page) && PageTransCompoundMap(page)) {
		unsigned long mask;
		/*
		 * The address we faulted on is backed by a transparent huge
		 * page.  However, because we map the compound huge page and
		 * not the individual tail page, we need to transfer the
		 * refcount to the head page.  We have to be careful that the
		 * THP doesn't start to split while we are adjusting the
		 * refcounts.
		 *
		 * We are sure this doesn't happen, because mmu_notifier_retry
		 * was successful and we are holding the mmu_lock, so if this
		 * THP is trying to split, it will be blocked in the mmu
		 * notifier before touching any of the pages, specifically
		 * before being able to call __split_huge_page_refcount().
		 *
		 * We can therefore safely transfer the refcount from PG_tail
		 * to PG_head and switch the pfn from a tail page to the head
		 * page accordingly.
		 */
		mask = PTRS_PER_PMD - 1;
		VM_BUG_ON((gfn & mask) != (pfn & mask));
		if (pfn & mask) {
			*ipap &= PMD_MASK;
			kvm_release_pfn_clean(pfn);
			pfn &= ~mask;
			kvm_get_pfn(pfn);
			*pfnp = pfn;
		}

		return true;
	}

	return false;
}

/**
 * stage2_wp_ptes - write protect PMD range
 * @pmd:	pointer to pmd entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		if (!pte_none(*pte)) {
			if (!kvm_s2pte_readonly(pte))
				kvm_set_s2pte_readonly(pte);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

/**
 * stage2_wp_pmds - write protect PUD range
 * @kvm:	kvm instance for the VM
 * @pud:	pointer to pud entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
			   phys_addr_t addr, phys_addr_t end)
{
	pmd_t *pmd;
	phys_addr_t next;

	pmd = stage2_pmd_offset(kvm, pud, addr);

	do {
		next = stage2_pmd_addr_end(kvm, addr, end);
		if (!pmd_none(*pmd)) {
			if (pmd_thp_or_huge(*pmd)) {
				if (!kvm_s2pmd_readonly(pmd))
					kvm_set_s2pmd_readonly(pmd);
			} else {
				stage2_wp_ptes(pmd, addr, next);
			}
		}
	} while (pmd++, addr = next, addr != end);
}

/**
 * stage2_wp_puds - write protect PGD range
 * @kvm:	kvm instance for the VM
 * @pgd:	pointer to pgd entry
 * @addr:	range start address
 * @end:	range end address
 */
static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
			   phys_addr_t addr, phys_addr_t end)
{
	pud_t *pud;
	phys_addr_t next;

	pud = stage2_pud_offset(kvm, pgd, addr);
	do {
		next = stage2_pud_addr_end(kvm, addr, end);
		if (!stage2_pud_none(kvm, *pud)) {
			if (stage2_pud_huge(kvm, *pud)) {
				if (!kvm_s2pud_readonly(pud))
					kvm_set_s2pud_readonly(pud);
			} else {
				stage2_wp_pmds(kvm, pud, addr, next);
			}
		}
	} while (pud++, addr = next, addr != end);
}

/**
 * stage2_wp_range() - write protect stage2 memory region range
 * @kvm:	The KVM pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
	pgd_t *pgd;
	phys_addr_t next;

	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
	do {
		/*
		 * Release kvm_mmu_lock periodically if the memory region is
		 * large. Otherwise, we may see kernel panics with
		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
		 * will also starve other vCPUs. We have to also make sure
		 * that the page tables are not freed while we released
		 * the lock.
		 */
		cond_resched_lock(&kvm->mmu_lock);
		if (!READ_ONCE(kvm->arch.pgd))
			break;
		next = stage2_pgd_addr_end(kvm, addr, end);
		if (stage2_pgd_present(kvm, *pgd))
			stage2_wp_puds(kvm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

/**
 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write-protects the associated PTEs.
 * Caller must acquire kvm_mmu_lock.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	stage2_wp_range(kvm, start, end);
}
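
/*
 * For example (illustrative numbers): with gfn_offset == 0 and
 * mask == 0b00111000, __ffs(mask) == 3 and __fls(mask) == 5, so the range
 * [base_gfn + 3, base_gfn + 6) is write protected.  Note that the range is
 * derived from the first and last set bit only, so any clear bits in
 * between are write protected as well.
 */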

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * dirty pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__clean_dcache_guest_page(pfn, size);
}

static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__invalidate_icache_guest_page(pfn, size);
}

static void kvm_send_hwpoison_signal(unsigned long address,
				     struct vm_area_struct *vma)
{
	short lsb;

	if (is_vm_hugetlb_page(vma))
		lsb = huge_page_shift(hstate_vma(vma));
	else
		lsb = PAGE_SHIFT;

	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
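
/*
 * Worked example of the alignment check above (illustrative, 2MB blocks):
 * a memslot with userspace_addr = 0x40100000 backing IPA 0x80000000 has
 * offsets 0x100000 and 0x0 within a 2MB block, so the masked values differ
 * and the fault falls back to page mappings; had both addresses started at
 * the same offset into a block, block mappings would be allowed (subject to
 * the head/tail containment check that follows).
 */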

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	int ret;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, needs_exec;
	unsigned long mmu_seq;
	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	kvm_pfn_t pfn;
	pgprot_t mem_type = PAGE_S2;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long vma_pagesize, flags = 0;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	down_read(&current->mm->mmap_sem);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		up_read(&current->mm->mmap_sem);
		return -EFAULT;
	}

	vma_pagesize = vma_kernel_pagesize(vma);
	if (logging_active ||
	    (vma->vm_flags & VM_PFNMAP) ||
	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
		force_pte = true;
		vma_pagesize = PAGE_SIZE;
	}

	/*
	 * The stage2 has a minimum of 2 level table (For arm64 see
	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
	 * As for PUD huge maps, we must make sure that we have at least
	 * 3 levels, i.e, PMD is not folded.
	 */
	if (vma_pagesize == PMD_SIZE ||
	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
	up_read(&current->mm->mmap_sem);

	/* We need minimum second+third level pages */
	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
				     KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to getting unmapped before we have
	 * a chance to grab the mmu_lock, which ensures that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		mem_type = PAGE_S2_DEVICE;
		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
	} else if (logging_active) {
		/*
		 * Faults on pages in a memslot with logging enabled
		 * should not be mapped with huge pages (it introduces churn
		 * and performance degradation), so force a pte mapping.
		 */
		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;

		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		if (!write_fault)
			writable = false;
	}

	if (exec_fault && is_iomap(flags))
		return -ENOEXEC;

	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	if (vma_pagesize == PAGE_SIZE && !force_pte) {
		/*
		 * Only PMD_SIZE transparent hugepages(THP) are
		 * currently supported. This code will need to be
		 * updated to support other THP sizes.
		 *
		 * Make sure the host VA and the guest IPA are sufficiently
		 * aligned and that the block is contained within the memslot.
		 */
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
		    transparent_hugepage_adjust(&pfn, &fault_ipa))
			vma_pagesize = PMD_SIZE;
	}

	if (writable)
		kvm_set_pfn_dirty(pfn);

	if (fault_status != FSC_PERM && !is_iomap(flags))
		clean_dcache_guest_page(pfn, vma_pagesize);

	if (exec_fault)
		invalidate_icache_guest_page(pfn, vma_pagesize);

	/*
	 * If we took an execution fault we have made the
	 * icache/dcache coherent above and should now let the s2
	 * mapping be executable.
	 *
	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
	 * execute permissions, and we preserve whatever we have.
	 */
	needs_exec = exec_fault ||
		(fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));

	if (vma_pagesize == PUD_SIZE) {
		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);

		new_pud = kvm_pud_mkhuge(new_pud);
		if (writable)
			new_pud = kvm_s2pud_mkwrite(new_pud);

		if (needs_exec)
			new_pud = kvm_s2pud_mkexec(new_pud);

		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
	} else if (vma_pagesize == PMD_SIZE) {
		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);

		new_pmd = kvm_pmd_mkhuge(new_pmd);

		if (writable)
			new_pmd = kvm_s2pmd_mkwrite(new_pmd);

		if (needs_exec)
			new_pmd = kvm_s2pmd_mkexec(new_pmd);

		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
	} else {
		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);

		if (writable) {
			new_pte = kvm_s2pte_mkwrite(new_pte);
			mark_page_dirty(kvm, gfn);
		}

		if (needs_exec)
			new_pte = kvm_s2pte_mkexec(new_pte);

		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
	return ret;
}
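
/*
 * Background for the access-fault path below: the page-aging MMU notifiers
 * clear the Access Flag in stage-2 entries (via the
 * stage2_*_test_and_clear_young() helpers above), so the next guest access
 * to such a page can trap as an FSC_ACCESS fault.  Making the entry young
 * again and calling kvm_set_pfn_accessed() is all that is needed; no TLB or
 * cache maintenance is required, as the comment below explains.
 */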
 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
 * so there is no need for atomic (pte|pmd)_mkyoung operations.
 */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	kvm_pfn_t pfn;
	bool pfn_valid = false;

	trace_kvm_access_fault(fault_ipa);

	spin_lock(&vcpu->kvm->mmu_lock);

	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
		goto out;

	if (pud) {		/* HugeTLB */
		*pud = kvm_s2pud_mkyoung(*pud);
		pfn = kvm_pud_pfn(*pud);
		pfn_valid = true;
	} else if (pmd) {	/* THP, HugeTLB */
		*pmd = pmd_mkyoung(*pmd);
		pfn = pmd_pfn(*pmd);
		pfn_valid = true;
	} else {
		*pte = pte_mkyoung(*pte);	/* Just a page... */
		pfn = pte_pfn(*pte);
		pfn_valid = true;
	}

out:
	spin_unlock(&vcpu->kvm->mmu_lock);
	if (pfn_valid)
		kvm_set_pfn_accessed(pfn);
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 * @run:	the kvm_run structure
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean either that
 * the guest simply needs more memory and we must allocate an appropriate
 * page, or that the guest tried to access I/O memory, which is emulated by
 * user space. The distinction is based on the IPA causing the fault and on
 * whether this memory region has been registered as standard RAM by user
 * space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	unsigned long fault_status;
	phys_addr_t fault_ipa;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);

	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	/* Synchronous External Abort? */
	if (kvm_vcpu_dabt_isextabt(vcpu)) {
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
		if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
			return 1;

		if (unlikely(!is_iabt)) {
			kvm_inject_vabt(vcpu);
			return 1;
		}
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check that the stage-2 fault is a translation, permission or access fault */
	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
	    fault_status != FSC_ACCESS) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_hsr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	gfn = fault_ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		if (is_iabt) {
			/* Prefetch Abort on I/O address */
			ret = -ENOEXEC;
			goto out;
		}

		/*
		 * Check for a cache maintenance operation. Since we
		 * ended up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
		if (kvm_vcpu_dabt_is_cm(vcpu)) {
			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
			ret = 1;
			goto out_unlock;
		}

		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
		ret = io_mem_abort(vcpu, run, fault_ipa);
		goto out_unlock;
	}

	/* Userspace should not be able to register out-of-bounds IPAs */
	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));

	if (fault_status == FSC_ACCESS) {
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
	if (ret == 0)
		ret = 1;
out:
	if (ret == -ENOEXEC) {
		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		ret = 1;
	}
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}

static int handle_hva_to_gpa(struct kvm *kvm,
			     unsigned long start,
			     unsigned long end,
			     int (*handler)(struct kvm *kvm,
					    gpa_t gpa, u64 size,
					    void *data),
			     void *data)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int ret = 0;

	slots = kvm_memslots(kvm);

	/* we only care about the pages that the guest sees */
	kvm_for_each_memslot(memslot, slots) {
		unsigned long hva_start, hva_end;
		gfn_t gpa;

		hva_start = max(start, memslot->userspace_addr);
		hva_end = min(end, memslot->userspace_addr +
					(memslot->npages << PAGE_SHIFT));
		if (hva_start >= hva_end)
			continue;

		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
	}

	return ret;
}

static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	unmap_stage2_range(kvm, gpa, size);
	return 0;
}

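/*
 * The MMU notifier callbacks below (unmap_hva_range, set_spte/change_pte,
 * age_hva and test_age_hva) all funnel through handle_hva_to_gpa() above:
 * each supplies a small handler that is run on the guest physical range
 * backing every host virtual range that intersects a memslot.
 */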
int kvm_unmap_hva_range(struct kvm *kvm,
			unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_unmap_hva_range(start, end);
	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
	return 0;
}

static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pte_t *pte = (pte_t *)data;

	WARN_ON(size != PAGE_SIZE);
	/*
	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
	 * flag clear because MMU notifiers will have unmapped a huge PMD before
	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
	 * therefore stage2_set_pte() never needs to clear out a huge PMD
	 * through this calling path.
	 */
	stage2_set_pte(kvm, NULL, gpa, pte, 0);
	return 0;
}

int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long end = hva + PAGE_SIZE;
	kvm_pfn_t pfn = pte_pfn(pte);
	pte_t stage2_pte;

	if (!kvm->arch.pgd)
		return 0;

	trace_kvm_set_spte_hva(hva);

	/*
	 * We've moved a page around, probably through CoW, so let's treat it
	 * just like a translation fault and clean the cache to the PoC.
	 */
	clean_dcache_guest_page(pfn, PAGE_SIZE);
	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);

	return 0;
}

static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
		return 0;

	if (pud)
		return stage2_pudp_test_and_clear_young(pud);
	else if (pmd)
		return stage2_pmdp_test_and_clear_young(pmd);
	else
		return stage2_ptep_test_and_clear_young(pte);
}

static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
		return 0;

	if (pud)
		return kvm_s2pud_young(*pud);
	else if (pmd)
		return pmd_young(*pmd);
	else
		return pte_young(*pte);
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	if (!kvm->arch.pgd)
		return 0;
	trace_kvm_age_hva(start, end);
	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	if (!kvm->arch.pgd)
		return 0;
	trace_kvm_test_age_hva(hva);
	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
}

void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}

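/*
 * Return the physical address of the page tables the HYP code runs on,
 * i.e. the value to be installed as the HYP translation table base
 * (HTTBR on 32-bit, TTBR0_EL2 on arm64). When the extended idmap is in
 * use, the merged PGD built by __kvm_extend_hypmap() is returned instead
 * of hyp_pgd.
 */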
phys_addr_t kvm_mmu_get_httbr(void)
{
	if (__kvm_cpu_uses_extended_idmap())
		return virt_to_phys(merged_hyp_pgd);
	else
		return virt_to_phys(hyp_pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

static int kvm_map_idmap_text(pgd_t *pgd)
{
	int err;

	/* Create the idmap in the boot page tables */
	err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
				    hyp_idmap_start, hyp_idmap_end,
				    __phys_to_pfn(hyp_idmap_start),
				    PAGE_HYP_EXEC);
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

int kvm_mmu_init(void)
{
	int err;

	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);

	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));

	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
	    hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space;
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
	if (!hyp_pgd) {
		kvm_err("Hyp mode PGD not allocated\n");
		err = -ENOMEM;
		goto out;
	}

	if (__kvm_cpu_uses_extended_idmap()) {
		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
							 hyp_pgd_order);
		if (!boot_hyp_pgd) {
			kvm_err("Hyp boot PGD not allocated\n");
			err = -ENOMEM;
			goto out;
		}

		err = kvm_map_idmap_text(boot_hyp_pgd);
		if (err)
			goto out;

		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		if (!merged_hyp_pgd) {
			kvm_err("Failed to allocate extra HYP pgd\n");
			goto out;
		}
		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
				    hyp_idmap_start);
	} else {
		err = kvm_map_idmap_text(hyp_pgd);
		if (err)
			goto out;
	}

	io_map_base = hyp_idmap_start;
	return 0;
out:
	free_hyp_pgds();
	return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   const struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point the memslot has been committed and there is an
	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
		kvm_mmu_wp_memory_region(kvm, mem->slot);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *memslot,
				   const struct kvm_userspace_memory_region *mem,
				   enum kvm_mr_change change)
{
	hva_t hva = mem->userspace_addr;
	hva_t reg_end = hva + mem->memory_size;
	bool writable = !(mem->flags & KVM_MEM_READONLY);
	int ret = 0;

	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
	    change != KVM_MR_FLAGS_ONLY)
		return 0;

	/*
	 * Prevent userspace from creating a memory region outside of the
	 * IPA space addressable by the guest.
	 */
	if (memslot->base_gfn + memslot->npages >=
	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
		return -EFAULT;

	down_read(&current->mm->mmap_sem);
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we can map
	 * any of them right now.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma = find_vma(current->mm, hva);
		hva_t vm_start, vm_end;

		if (!vma || vma->vm_start >= reg_end)
			break;

		/*
		 * Mapping a read-only VMA is only allowed if the
		 * memory region is configured as read-only.
		 */
		if (writable && !(vma->vm_flags & VM_WRITE)) {
			ret = -EPERM;
			break;
		}

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (vma->vm_flags & VM_PFNMAP) {
			gpa_t gpa = mem->guest_phys_addr +
				    (vm_start - mem->userspace_addr);
			phys_addr_t pa;

			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
			pa += vm_start - vma->vm_start;

			/* IO region dirty page logging not allowed */
			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				ret = -EINVAL;
				goto out;
			}

			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
						    vm_end - vm_start,
						    writable);
			if (ret)
				break;
		}
		hva = vm_end;
	} while (hva < reg_end);

	if (change == KVM_MR_FLAGS_ONLY)
		goto out;

	spin_lock(&kvm->mmu_lock);
	if (ret)
		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
	else
		stage2_flush_memslot(kvm, memslot);
	spin_unlock(&kvm->mmu_lock);
out:
	up_read(&current->mm->mmap_sem);
	return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
			   struct kvm_memory_slot *dont)
{
}

int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
			    unsigned long npages)
{
	return 0;
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_free_stage2_pgd(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	unmap_stage2_range(kvm, gpa, size);
	spin_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
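 *
 * The two helpers below implement this policy; they are expected to be
 * called from the set/way and SCTLR trap handlers respectively (e.g.
 * access_dcsw() and access_vm_reg() in the arm64 sysreg trap code).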
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}