
Lines Matching +full:wp +full:- +full:content

1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
5 * This module enables machines with Intel VT-x extensions to run virtual
59 static int __read_mostly nx_huge_pages = -1;
90 * When setting this variable to true it enables Two-Dimensional-Paging
92 * 1. the guest-virtual to guest-physical
93 * 2. while doing 1. it walks guest-physical to host-physical
120 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
123 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
124 * PT32_LEVEL_BITS))) - 1))
127 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
134 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
135 * PT32_LEVEL_BITS))) - 1))
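
The bare expressions above are continuation lines of mmu.c's 32-bit (non-PAE) paging macros (PT32_LEVEL_SHIFT and friends). As a quick illustration of what they compute, here is a standalone sketch, assuming PAGE_SHIFT = 12 and PT32_LEVEL_BITS = 10 (1024 entries per level), with the index macro reconstructed under its usual name:

    #include <stdio.h>

    /* Assumed values: 4 KiB pages, 10 index bits per 32-bit paging level. */
    #define PAGE_SHIFT              12
    #define PT32_LEVEL_BITS         10
    #define PT32_LEVEL_SHIFT(level) (PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)
    #define PT32_INDEX(address, level) \
            (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))

    int main(void)
    {
            unsigned long addr = 0xc0101000UL;

            /* Page-directory index (level 2) and page-table index (level 1). */
            printf("pde index = %lu, pte index = %lu\n",
                   (unsigned long)PT32_INDEX(addr, 2),   /* 768 */
                   (unsigned long)PT32_INDEX(addr, 1));  /* 257 */
            return 0;
    }
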
192 int ret = -ENOTSUPP; in kvm_flush_remote_tlbs_with_range()
256 gen = kvm_vcpu_memslots(vcpu)->generation; in check_mmio_spte()
280 return vcpu->arch.efer & EFER_NX; in is_nx()
285 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; in pse36_gfn_delta()
328 sp->clear_spte_count++; in count_spte_clear()
338 ssptep->spte_high = sspte.spte_high; in __set_spte()
347 WRITE_ONCE(ssptep->spte_low, sspte.spte_low); in __set_spte()
357 WRITE_ONCE(ssptep->spte_low, sspte.spte_low); in __update_clear_spte_fast()
365 ssptep->spte_high = sspte.spte_high; in __update_clear_spte_fast()
377 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); in __update_clear_spte_slow()
378 orig.spte_high = ssptep->spte_high; in __update_clear_spte_slow()
379 ssptep->spte_high = sspte.spte_high; in __update_clear_spte_slow()
391 * we need to protect against in-progress updates of the spte.
394 * for the high part of the spte. The race is fine for a present->non-present
395 * change (because the high part of the spte is ignored for non-present spte),
396 * but for a present->present change we must reread the spte.
398 * All such changes are done in two steps (present->non-present and
399 * non-present->present), hence it is enough to count the number of
400 * present->non-present updates: if it changed while reading the spte,
410 count = sp->clear_spte_count; in __get_spte_lockless()
413 spte.spte_low = orig->spte_low; in __get_spte_lockless()
416 spte.spte_high = orig->spte_high; in __get_spte_lockless()
419 if (unlikely(spte.spte_low != orig->spte_low || in __get_spte_lockless()
420 count != sp->clear_spte_count)) in __get_spte_lockless()
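
The block comment and the __get_spte_lockless() fragments above describe a counter-based retry for reading a 64-bit SPTE that is split into two 32-bit halves: snapshot the present->non-present counter, read the low half, then the high half, and retry if either the low half or the counter changed meanwhile. A minimal sketch of that pattern, with hypothetical stand-in types and the real memory barriers omitted:

    #include <stdint.h>

    /* Hypothetical stand-ins for the real structures; smp_rmb() is omitted. */
    struct split_spte { volatile uint32_t spte_low, spte_high; };
    struct shadow_page { volatile int clear_spte_count; };

    static uint64_t get_spte_lockless_sketch(struct shadow_page *sp,
                                             struct split_spte *orig)
    {
            uint32_t low, high;
            int count;

    retry:
            count = sp->clear_spte_count;   /* the real code has smp_rmb() here */
            low = orig->spte_low;           /* low half first */
            high = orig->spte_high;         /* then the high half */
            /* A racing present->non-present update invalidates the read: retry. */
            if (low != orig->spte_low || count != sp->clear_spte_count)
                    goto retry;
            return ((uint64_t)high << 32) | low;
    }
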
434 * out of mmu-lock, it can ensure dirty bit is not lost, in spte_has_volatile_bits()
491 * Whenever we overwrite a writable spte with a read-only one we
493 * will find a read-only spte, even though the writable spte
508 * For the spte updated out of mmu-lock is safe, since in mmu_spte_update()
538 * Returns non-zero if the PTE was previously valid.
586 /* Restore an acc-track PTE back to a regular PTE */
613 clear_bit((ffs(shadow_accessed_mask) - 1), in mmu_spte_age()
633 * Prevent page table teardown by making any free-er wait during in walk_shadow_page_lockless_begin()
640 * to vcpu->mode. in walk_shadow_page_lockless_begin()
642 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); in walk_shadow_page_lockless_begin()
648 * Make sure the write to vcpu->mode is not reordered in front of in walk_shadow_page_lockless_end()
652 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); in walk_shadow_page_lockless_end()
661 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, in mmu_topup_memory_caches()
665 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, in mmu_topup_memory_caches()
670 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, in mmu_topup_memory_caches()
675 return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, in mmu_topup_memory_caches()
681 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); in mmu_free_memory_caches()
682 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); in mmu_free_memory_caches()
683 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); in mmu_free_memory_caches()
684 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); in mmu_free_memory_caches()
689 return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); in mmu_alloc_pte_list_desc()
699 if (!sp->role.direct) in kvm_mmu_page_get_gfn()
700 return sp->gfns[index]; in kvm_mmu_page_get_gfn()
702 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); in kvm_mmu_page_get_gfn()
707 if (!sp->role.direct) { in kvm_mmu_page_set_gfn()
708 sp->gfns[index] = gfn; in kvm_mmu_page_set_gfn()
715 sp->gfn, in kvm_mmu_page_set_gfn()
729 idx = gfn_to_index(gfn, slot->base_gfn, level); in lpage_info_slot()
730 return &slot->arch.lpage_info[level - 2][idx]; in lpage_info_slot()
741 linfo->disallow_lpage += count; in update_gfn_disallow_lpage_count()
742 WARN_ON(linfo->disallow_lpage < 0); in update_gfn_disallow_lpage_count()
753 update_gfn_disallow_lpage_count(slot, gfn, -1); in kvm_mmu_gfn_allow_lpage()
762 kvm->arch.indirect_shadow_pages++; in account_shadowed()
763 gfn = sp->gfn; in account_shadowed()
764 slots = kvm_memslots_for_spte_role(kvm, sp->role); in account_shadowed()
767 /* the non-leaf shadow pages are keeping readonly. */ in account_shadowed()
768 if (sp->role.level > PG_LEVEL_4K) in account_shadowed()
777 if (sp->lpage_disallowed) in account_huge_nx_page()
780 ++kvm->stat.nx_lpage_splits; in account_huge_nx_page()
781 list_add_tail(&sp->lpage_disallowed_link, in account_huge_nx_page()
782 &kvm->arch.lpage_disallowed_mmu_pages); in account_huge_nx_page()
783 sp->lpage_disallowed = true; in account_huge_nx_page()
792 kvm->arch.indirect_shadow_pages--; in unaccount_shadowed()
793 gfn = sp->gfn; in unaccount_shadowed()
794 slots = kvm_memslots_for_spte_role(kvm, sp->role); in unaccount_shadowed()
796 if (sp->role.level > PG_LEVEL_4K) in unaccount_shadowed()
805 --kvm->stat.nx_lpage_splits; in unaccount_huge_nx_page()
806 sp->lpage_disallowed = false; in unaccount_huge_nx_page()
807 list_del(&sp->lpage_disallowed_link); in unaccount_huge_nx_page()
817 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) in gfn_to_memslot_dirty_bitmap()
819 if (no_dirty_log && slot->dirty_bitmap) in gfn_to_memslot_dirty_bitmap()
828 * If the bit zero of rmap_head->val is clear, then it points to the only spte
829 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
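
Put differently, rmap_head->val is a tagged pointer: with bit 0 clear it holds a single spte pointer, with bit 0 set it points (tag masked off) to a pte_list_desc chain. A hedged decode sketch using simplified stand-in structures; it mirrors what rmap_get_first() further down in this listing does:

    #include <stddef.h>

    typedef unsigned long long u64;

    /* Simplified stand-ins for the real pte_list_desc and kvm_rmap_head. */
    #define PTE_LIST_EXT 3
    struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; };
    struct kvm_rmap_head { unsigned long val; };

    static u64 *first_spte(struct kvm_rmap_head *rmap_head)
    {
            if (!rmap_head->val)
                    return NULL;                    /* no spte maps this gfn */
            if (!(rmap_head->val & 1))
                    return (u64 *)rmap_head->val;   /* bit 0 clear: one spte */
            /* bit 0 set: descriptor chain, strip the tag and take the first entry */
            return ((struct pte_list_desc *)(rmap_head->val & ~1ul))->sptes[0];
    }
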
842 if (!rmap_head->val) { in pte_list_add()
843 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); in pte_list_add()
844 rmap_head->val = (unsigned long)spte; in pte_list_add()
845 } else if (!(rmap_head->val & 1)) { in pte_list_add()
846 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); in pte_list_add()
848 desc->sptes[0] = (u64 *)rmap_head->val; in pte_list_add()
849 desc->sptes[1] = spte; in pte_list_add()
850 rmap_head->val = (unsigned long)desc | 1; in pte_list_add()
853 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); in pte_list_add()
854 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); in pte_list_add()
855 while (desc->sptes[PTE_LIST_EXT-1]) { in pte_list_add()
858 if (!desc->more) { in pte_list_add()
859 desc->more = mmu_alloc_pte_list_desc(vcpu); in pte_list_add()
860 desc = desc->more; in pte_list_add()
863 desc = desc->more; in pte_list_add()
865 for (i = 0; desc->sptes[i]; ++i) in pte_list_add()
867 desc->sptes[i] = spte; in pte_list_add()
879 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) in pte_list_desc_remove_entry()
881 desc->sptes[i] = desc->sptes[j]; in pte_list_desc_remove_entry()
882 desc->sptes[j] = NULL; in pte_list_desc_remove_entry()
885 if (!prev_desc && !desc->more) in pte_list_desc_remove_entry()
886 rmap_head->val = 0; in pte_list_desc_remove_entry()
889 prev_desc->more = desc->more; in pte_list_desc_remove_entry()
891 rmap_head->val = (unsigned long)desc->more | 1; in pte_list_desc_remove_entry()
901 if (!rmap_head->val) { in __pte_list_remove()
902 pr_err("%s: %p 0->BUG\n", __func__, spte); in __pte_list_remove()
904 } else if (!(rmap_head->val & 1)) { in __pte_list_remove()
905 rmap_printk("%s: %p 1->0\n", __func__, spte); in __pte_list_remove()
906 if ((u64 *)rmap_head->val != spte) { in __pte_list_remove()
907 pr_err("%s: %p 1->BUG\n", __func__, spte); in __pte_list_remove()
910 rmap_head->val = 0; in __pte_list_remove()
912 rmap_printk("%s: %p many->many\n", __func__, spte); in __pte_list_remove()
913 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); in __pte_list_remove()
916 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { in __pte_list_remove()
917 if (desc->sptes[i] == spte) { in __pte_list_remove()
924 desc = desc->more; in __pte_list_remove()
926 pr_err("%s: %p many->many\n", __func__, spte); in __pte_list_remove()
942 idx = gfn_to_index(gfn, slot->base_gfn, level); in __gfn_to_rmap()
943 return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; in __gfn_to_rmap()
952 slots = kvm_memslots_for_spte_role(kvm, sp->role); in gfn_to_rmap()
954 return __gfn_to_rmap(gfn, sp->role.level, slot); in gfn_to_rmap()
961 mc = &vcpu->arch.mmu_pte_list_desc_cache; in rmap_can_add()
971 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); in rmap_add()
972 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); in rmap_add()
983 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); in rmap_remove()
1010 if (!rmap_head->val) in rmap_get_first()
1013 if (!(rmap_head->val & 1)) { in rmap_get_first()
1014 iter->desc = NULL; in rmap_get_first()
1015 sptep = (u64 *)rmap_head->val; in rmap_get_first()
1019 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); in rmap_get_first()
1020 iter->pos = 0; in rmap_get_first()
1021 sptep = iter->desc->sptes[iter->pos]; in rmap_get_first()
1036 if (iter->desc) { in rmap_get_next()
1037 if (iter->pos < PTE_LIST_EXT - 1) { in rmap_get_next()
1038 ++iter->pos; in rmap_get_next()
1039 sptep = iter->desc->sptes[iter->pos]; in rmap_get_next()
1044 iter->desc = iter->desc->more; in rmap_get_next()
1046 if (iter->desc) { in rmap_get_next()
1047 iter->pos = 0; in rmap_get_next()
1048 /* desc->sptes[0] cannot be NULL */ in rmap_get_next()
1049 sptep = iter->desc->sptes[iter->pos]; in rmap_get_next()
1074 WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); in __drop_large_spte()
1076 --kvm->stat.lpages; in __drop_large_spte()
1085 if (__drop_large_spte(vcpu->kvm, sptep)) { in drop_large_spte()
1088 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, in drop_large_spte()
1089 KVM_PAGES_PER_HPAGE(sp->role.level)); in drop_large_spte()
1094 * Write-protect on the specified @sptep, @pt_protect indicates whether
1095 * spte write-protection is caused by protecting shadow page table.
1099 * - for dirty logging, the spte can be set to writable at anytime if
1101 * - for spte protection, the spte can be writable only after unsync-ing
1160 * - D bit on ad-enabled SPTEs, and
1161 * - W bit on ad-disabled SPTEs.
1209 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1224 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_write_protect_pt_masked()
1226 slot->base_gfn + gfn_offset, mask, true); in kvm_mmu_write_protect_pt_masked()
1228 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), in kvm_mmu_write_protect_pt_masked()
1233 mask &= mask - 1; in kvm_mmu_write_protect_pt_masked()
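
The loop above walks the dirty mask one set bit at a time: __ffs() picks the lowest set bit (the gfn offset to handle) and mask &= mask - 1 clears it. A standalone illustration of that bit walk, with __builtin_ctzl() standing in for the kernel's __ffs():

    #include <stdio.h>

    /* Stand-in for the kernel's __ffs(): index of the lowest set bit. */
    static unsigned long lowest_set_bit(unsigned long x) { return __builtin_ctzl(x); }

    int main(void)
    {
            unsigned long mask = 0x29;      /* page offsets 0, 3 and 5 are dirty */

            while (mask) {
                    printf("handle page offset %lu\n", lowest_set_bit(mask));
                    mask &= mask - 1;       /* clear the lowest set bit */
            }
            return 0;
    }
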
1238 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1239 * protect the page if the D-bit isn't supported.
1241 * @slot: slot to clear D-bit
1243 * @mask: indicates which pages we should clear D-bit
1245 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1253 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_clear_dirty_pt_masked()
1255 slot->base_gfn + gfn_offset, mask, false); in kvm_mmu_clear_dirty_pt_masked()
1257 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), in kvm_mmu_clear_dirty_pt_masked()
1262 mask &= mask - 1; in kvm_mmu_clear_dirty_pt_masked()
1268 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1300 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_gfn_write_protect()
1312 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); in rmap_write_protect()
1399 iterator->level = level; in rmap_walk_init_level()
1400 iterator->gfn = iterator->start_gfn; in rmap_walk_init_level()
1401 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); in rmap_walk_init_level()
1402 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, in rmap_walk_init_level()
1403 iterator->slot); in rmap_walk_init_level()
1411 iterator->slot = slot; in slot_rmap_walk_init()
1412 iterator->start_level = start_level; in slot_rmap_walk_init()
1413 iterator->end_level = end_level; in slot_rmap_walk_init()
1414 iterator->start_gfn = start_gfn; in slot_rmap_walk_init()
1415 iterator->end_gfn = end_gfn; in slot_rmap_walk_init()
1417 rmap_walk_init_level(iterator, iterator->start_level); in slot_rmap_walk_init()
1422 return !!iterator->rmap; in slot_rmap_walk_okay()
1427 if (++iterator->rmap <= iterator->end_rmap) { in slot_rmap_walk_next()
1428 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); in slot_rmap_walk_next()
1432 if (++iterator->level > iterator->end_level) { in slot_rmap_walk_next()
1433 iterator->rmap = NULL; in slot_rmap_walk_next()
1437 rmap_walk_init_level(iterator, iterator->level); in slot_rmap_walk_next()
1470 hva_start = max(start, memslot->userspace_addr); in kvm_handle_hva_range()
1471 hva_end = min(end, memslot->userspace_addr + in kvm_handle_hva_range()
1472 (memslot->npages << PAGE_SHIFT)); in kvm_handle_hva_range()
1477 * {gfn_start, gfn_start+1, ..., gfn_end-1}. in kvm_handle_hva_range()
1480 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); in kvm_handle_hva_range()
1484 gfn_start, gfn_end - 1, in kvm_handle_hva_range()
1512 if (kvm->arch.tdp_mmu_enabled) in kvm_unmap_hva_range()
1524 if (kvm->arch.tdp_mmu_enabled) in kvm_set_spte_hva()
1567 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); in rmap_recycle()
1569 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); in rmap_recycle()
1570 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, in rmap_recycle()
1571 KVM_PAGES_PER_HPAGE(sp->role.level)); in rmap_recycle()
1579 if (kvm->arch.tdp_mmu_enabled) in kvm_age_hva()
1590 if (kvm->arch.tdp_mmu_enabled) in kvm_test_age_hva()
1614 * kvm->arch.n_used_mmu_pages values. We need a global,
1620 kvm->arch.n_used_mmu_pages += nr; in kvm_mod_used_mmu_pages()
1626 MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); in kvm_mmu_free_page()
1627 hlist_del(&sp->hash_link); in kvm_mmu_free_page()
1628 list_del(&sp->link); in kvm_mmu_free_page()
1629 free_page((unsigned long)sp->spt); in kvm_mmu_free_page()
1630 if (!sp->role.direct) in kvm_mmu_free_page()
1631 free_page((unsigned long)sp->gfns); in kvm_mmu_free_page()
1646 pte_list_add(vcpu, parent_pte, &sp->parent_ptes); in mmu_page_add_parent_pte()
1652 __pte_list_remove(parent_pte, &sp->parent_ptes); in mmu_page_remove_parent_pte()
1666 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); in kvm_mmu_alloc_page()
1667 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); in kvm_mmu_alloc_page()
1669 sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); in kvm_mmu_alloc_page()
1670 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); in kvm_mmu_alloc_page()
1677 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; in kvm_mmu_alloc_page()
1678 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); in kvm_mmu_alloc_page()
1679 kvm_mod_used_mmu_pages(vcpu->kvm, +1); in kvm_mmu_alloc_page()
1689 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { in kvm_mmu_mark_parents_unsync()
1700 index = spte - sp->spt; in mark_unsync()
1701 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) in mark_unsync()
1703 if (sp->unsync_children++) in mark_unsync()
1729 if (sp->unsync) in mmu_pages_add()
1730 for (i=0; i < pvec->nr; i++) in mmu_pages_add()
1731 if (pvec->page[i].sp == sp) in mmu_pages_add()
1734 pvec->page[pvec->nr].sp = sp; in mmu_pages_add()
1735 pvec->page[pvec->nr].idx = idx; in mmu_pages_add()
1736 pvec->nr++; in mmu_pages_add()
1737 return (pvec->nr == KVM_PAGE_ARRAY_NR); in mmu_pages_add()
1742 --sp->unsync_children; in clear_unsync_child_bit()
1743 WARN_ON((int)sp->unsync_children < 0); in clear_unsync_child_bit()
1744 __clear_bit(idx, sp->unsync_child_bitmap); in clear_unsync_child_bit()
1752 for_each_set_bit(i, sp->unsync_child_bitmap, 512) { in __mmu_unsync_walk()
1754 u64 ent = sp->spt[i]; in __mmu_unsync_walk()
1763 if (child->unsync_children) { in __mmu_unsync_walk()
1765 return -ENOSPC; in __mmu_unsync_walk()
1775 } else if (child->unsync) { in __mmu_unsync_walk()
1778 return -ENOSPC; in __mmu_unsync_walk()
1786 #define INVALID_INDEX (-1)
1791 pvec->nr = 0; in mmu_unsync_walk()
1792 if (!sp->unsync_children) in mmu_unsync_walk()
1801 WARN_ON(!sp->unsync); in kvm_unlink_unsync_page()
1803 sp->unsync = 0; in kvm_unlink_unsync_page()
1804 --kvm->stat.mmu_unsync; in kvm_unlink_unsync_page()
1819 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
1820 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
1824 return sp->role.cr0_wp && sp->role.smap_andnot_wp; in is_ept_sp()
1827 /* @sp->gfn should be write-protected at the call site */
1831 if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) || in __kvm_sync_page()
1832 vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { in __kvm_sync_page()
1833 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); in __kvm_sync_page()
1858 if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) in kvm_mmu_flush_or_zap()
1874 return sp->role.invalid || in is_obsolete_sp()
1875 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); in is_obsolete_sp()
1881 kvm_unlink_unsync_page(vcpu->kvm, sp); in kvm_sync_page()
1885 /* @gfn should be write-protected at the call site */
1892 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { in kvm_sync_pages()
1893 if (!s->unsync) in kvm_sync_pages()
1896 WARN_ON(s->role.level != PG_LEVEL_4K); in kvm_sync_pages()
1919 for (n = i+1; n < pvec->nr; n++) { in mmu_pages_next()
1920 struct kvm_mmu_page *sp = pvec->page[n].sp; in mmu_pages_next()
1921 unsigned idx = pvec->page[n].idx; in mmu_pages_next()
1922 int level = sp->role.level; in mmu_pages_next()
1924 parents->idx[level-1] = idx; in mmu_pages_next()
1928 parents->parent[level-2] = sp; in mmu_pages_next()
1940 if (pvec->nr == 0) in mmu_pages_first()
1943 WARN_ON(pvec->page[0].idx != INVALID_INDEX); in mmu_pages_first()
1945 sp = pvec->page[0].sp; in mmu_pages_first()
1946 level = sp->role.level; in mmu_pages_first()
1949 parents->parent[level-2] = sp; in mmu_pages_first()
1954 parents->parent[level-1] = NULL; in mmu_pages_first()
1964 unsigned int idx = parents->idx[level]; in mmu_pages_clear_parents()
1965 sp = parents->parent[level]; in mmu_pages_clear_parents()
1972 } while (!sp->unsync_children); in mmu_pages_clear_parents()
1989 protected |= rmap_write_protect(vcpu, sp->gfn); in mmu_sync_children()
1992 kvm_flush_remote_tlbs(vcpu->kvm); in mmu_sync_children()
2000 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { in mmu_sync_children()
2002 cond_resched_lock(&vcpu->kvm->mmu_lock); in mmu_sync_children()
2012 atomic_set(&sp->write_flooding_count, 0); in __clear_sp_write_flooding_count()
2027 bool direct_mmu = vcpu->arch.mmu->direct_map; in kvm_mmu_get_page()
2037 role = vcpu->arch.mmu->mmu_role.base; in kvm_mmu_get_page()
2043 if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { in kvm_mmu_get_page()
2045 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; in kvm_mmu_get_page()
2049 sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; in kvm_mmu_get_page()
2050 for_each_valid_sp(vcpu->kvm, sp, sp_list) { in kvm_mmu_get_page()
2051 if (sp->gfn != gfn) { in kvm_mmu_get_page()
2056 if (!need_sync && sp->unsync) in kvm_mmu_get_page()
2059 if (sp->role.word != role.word) in kvm_mmu_get_page()
2065 if (sp->unsync) { in kvm_mmu_get_page()
2076 if (sp->unsync_children) in kvm_mmu_get_page()
2086 ++vcpu->kvm->stat.mmu_cache_miss; in kvm_mmu_get_page()
2090 sp->gfn = gfn; in kvm_mmu_get_page()
2091 sp->role = role; in kvm_mmu_get_page()
2092 hlist_add_head(&sp->hash_link, sp_list); in kvm_mmu_get_page()
2096 * otherwise the content of the synced shadow page may in kvm_mmu_get_page()
2099 account_shadowed(vcpu->kvm, sp); in kvm_mmu_get_page()
2101 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); in kvm_mmu_get_page()
2110 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) in kvm_mmu_get_page()
2111 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; in kvm_mmu_get_page()
2119 iterator->addr = addr; in shadow_walk_init_using_root()
2120 iterator->shadow_addr = root; in shadow_walk_init_using_root()
2121 iterator->level = vcpu->arch.mmu->shadow_root_level; in shadow_walk_init_using_root()
2123 if (iterator->level == PT64_ROOT_4LEVEL && in shadow_walk_init_using_root()
2124 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && in shadow_walk_init_using_root()
2125 !vcpu->arch.mmu->direct_map) in shadow_walk_init_using_root()
2126 --iterator->level; in shadow_walk_init_using_root()
2128 if (iterator->level == PT32E_ROOT_LEVEL) { in shadow_walk_init_using_root()
2130 * prev_root is currently only used for 64-bit hosts. So only in shadow_walk_init_using_root()
2133 BUG_ON(root != vcpu->arch.mmu->root_hpa); in shadow_walk_init_using_root()
2135 iterator->shadow_addr in shadow_walk_init_using_root()
2136 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; in shadow_walk_init_using_root()
2137 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; in shadow_walk_init_using_root()
2138 --iterator->level; in shadow_walk_init_using_root()
2139 if (!iterator->shadow_addr) in shadow_walk_init_using_root()
2140 iterator->level = 0; in shadow_walk_init_using_root()
2147 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa, in shadow_walk_init()
2153 if (iterator->level < PG_LEVEL_4K) in shadow_walk_okay()
2156 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); in shadow_walk_okay()
2157 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; in shadow_walk_okay()
2164 if (is_last_spte(spte, iterator->level)) { in __shadow_walk_next()
2165 iterator->level = 0; in __shadow_walk_next()
2169 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; in __shadow_walk_next()
2170 --iterator->level; in __shadow_walk_next()
2175 __shadow_walk_next(iterator, *iterator->sptep); in shadow_walk_next()
2185 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); in link_shadow_page()
2191 if (sp->unsync_children || sp->unsync) in link_shadow_page()
2204 * sp's access: allow writable in the read-only sp, in validate_direct_spte()
2209 if (child->role.access == direct_access) in validate_direct_spte()
2213 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); in validate_direct_spte()
2217 /* Returns the number of zapped non-leaf child shadow pages. */
2226 if (is_last_spte(pte, sp->role.level)) { in mmu_page_zap_pte()
2229 --kvm->stat.lpages; in mmu_page_zap_pte()
2240 child->role.guest_mode && !child->parent_ptes.val) in mmu_page_zap_pte()
2258 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); in kvm_mmu_page_unlink_children()
2268 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) in kvm_mmu_unlink_parents()
2280 if (parent->role.level == PG_LEVEL_4K) in mmu_zap_unsync_children()
2304 ++kvm->stat.mmu_shadow_zapped; in __kvm_mmu_prepare_zap_page()
2312 if (!sp->role.invalid && !sp->role.direct) in __kvm_mmu_prepare_zap_page()
2315 if (sp->unsync) in __kvm_mmu_prepare_zap_page()
2317 if (!sp->root_count) { in __kvm_mmu_prepare_zap_page()
2324 * !sp->root_count. in __kvm_mmu_prepare_zap_page()
2326 if (sp->role.invalid) in __kvm_mmu_prepare_zap_page()
2327 list_add(&sp->link, invalid_list); in __kvm_mmu_prepare_zap_page()
2329 list_move(&sp->link, invalid_list); in __kvm_mmu_prepare_zap_page()
2330 kvm_mod_used_mmu_pages(kvm, -1); in __kvm_mmu_prepare_zap_page()
2336 list_del(&sp->link); in __kvm_mmu_prepare_zap_page()
2347 if (sp->lpage_disallowed) in __kvm_mmu_prepare_zap_page()
2350 sp->role.invalid = 1; in __kvm_mmu_prepare_zap_page()
2373 * the page tables and see changes to vcpu->mode here. The barrier in kvm_mmu_commit_zap_page()
2383 WARN_ON(!sp->role.invalid || sp->root_count); in kvm_mmu_commit_zap_page()
2397 if (list_empty(&kvm->arch.active_mmu_pages)) in kvm_mmu_zap_oldest_mmu_pages()
2401 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { in kvm_mmu_zap_oldest_mmu_pages()
2406 if (sp->root_count) in kvm_mmu_zap_oldest_mmu_pages()
2421 kvm->stat.mmu_recycled += total_zapped; in kvm_mmu_zap_oldest_mmu_pages()
2427 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) in kvm_mmu_available_pages()
2428 return kvm->arch.n_max_mmu_pages - in kvm_mmu_available_pages()
2429 kvm->arch.n_used_mmu_pages; in kvm_mmu_available_pages()
2436 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); in make_mmu_pages_available()
2441 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); in make_mmu_pages_available()
2443 if (!kvm_mmu_available_pages(vcpu->kvm)) in make_mmu_pages_available()
2444 return -ENOSPC; in make_mmu_pages_available()
2454 spin_lock(&kvm->mmu_lock); in kvm_mmu_change_mmu_pages()
2456 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { in kvm_mmu_change_mmu_pages()
2457 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - in kvm_mmu_change_mmu_pages()
2460 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; in kvm_mmu_change_mmu_pages()
2463 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; in kvm_mmu_change_mmu_pages()
2465 spin_unlock(&kvm->mmu_lock); in kvm_mmu_change_mmu_pages()
2476 spin_lock(&kvm->mmu_lock); in kvm_mmu_unprotect_page()
2479 sp->role.word); in kvm_mmu_unprotect_page()
2484 spin_unlock(&kvm->mmu_lock); in kvm_mmu_unprotect_page()
2493 ++vcpu->kvm->stat.mmu_unsync; in kvm_unsync_page()
2494 sp->unsync = 1; in kvm_unsync_page()
2507 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { in mmu_need_write_protect()
2511 if (sp->unsync) in mmu_need_write_protect()
2514 WARN_ON(sp->role.level != PG_LEVEL_4K); in mmu_need_write_protect()
2523 * before the page had been marked as unsync-ed, something like the in mmu_need_write_protect()
2527 * --------------------------------------------------------------------- in mmu_need_write_protect()
2540 * 2.3 kvm_mmu_sync_pages() reads sp->unsync. in mmu_need_write_protect()
2549 * (sp->unsync = true) in mmu_need_write_protect()
2616 drop_spte(vcpu->kvm, sptep); in mmu_set_spte()
2631 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, in mmu_set_spte()
2649 ++vcpu->kvm->stat.lpages; in mmu_set_spte()
2680 unsigned int access = sp->role.access; in direct_pte_prefetch_many()
2684 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); in direct_pte_prefetch_many()
2687 return -1; in direct_pte_prefetch_many()
2689 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); in direct_pte_prefetch_many()
2691 return -1; in direct_pte_prefetch_many()
2694 mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, in direct_pte_prefetch_many()
2708 WARN_ON(!sp->role.direct); in __direct_pte_prefetch()
2710 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); in __direct_pte_prefetch()
2711 spte = sp->spt + i; in __direct_pte_prefetch()
2739 if (sp->role.level > PG_LEVEL_4K) in direct_pte_prefetch()
2756 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() in host_pfn_mapping_level()
2759 * read-only memslots due to gfn_to_hva() assuming writes. Earlier in host_pfn_mapping_level()
2761 * read-only memslot. in host_pfn_mapping_level()
2765 pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); in host_pfn_mapping_level()
2795 for ( ; max_level > PG_LEVEL_4K; max_level--) { in kvm_mmu_hugepage_adjust()
2797 if (!linfo->disallow_lpage) in kvm_mmu_hugepage_adjust()
2821 mask = KVM_PAGES_PER_HPAGE(level) - 1; in kvm_mmu_hugepage_adjust()
2843 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - in disallowed_hugepage_adjust()
2844 KVM_PAGES_PER_HPAGE(level - 1); in disallowed_hugepage_adjust()
2846 (*goal_levelp)--; in disallowed_hugepage_adjust()
2864 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) in __direct_map()
2880 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); in __direct_map()
2887 it.level - 1, true, ACC_ALL); in __direct_map()
2892 account_huge_nx_page(vcpu->kvm, sp); in __direct_map()
2903 ++vcpu->stat.pf_fixed; in __direct_map()
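
The kvm_mmu_hugepage_adjust(), disallowed_hugepage_adjust() and __direct_map() excerpts above all rely on the same alignment arithmetic: KVM_PAGES_PER_HPAGE(level) - 1 is the within-huge-page mask, and gfn & ~mask is the huge page's base gfn. A small worked example, assuming KVM_PAGES_PER_HPAGE(level) expands to 1 << ((level - 1) * 9) on x86:

    #include <stdio.h>

    /* Assumed x86 expansion: 9 index bits per page-table level. */
    #define PAGES_PER_HPAGE(level)  (1UL << (((level) - 1) * 9))

    int main(void)
    {
            unsigned long gfn = 0x12345;

            /* Level 2 = 2 MiB pages = 512 small pages: align gfn down to 512. */
            printf("mask = %lu, base_gfn = 0x%lx\n",
                   PAGES_PER_HPAGE(2) - 1,                  /* 511 */
                   gfn & ~(PAGES_PER_HPAGE(2) - 1));        /* 0x12200 */
            return 0;
    }
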
2927 return -EFAULT; in kvm_handle_bad_page()
2966 * is caused by write-protect, that means we just need change the W in page_fault_can_be_fast()
2967 * bit of the spte which can be done out of mmu-lock. in page_fault_can_be_fast()
2969 * However, if access tracking is disabled we know that a non-present in page_fault_can_be_fast()
2990 WARN_ON(!sp->role.direct); in fast_pf_fix_direct_spte()
3000 * so non-PML cases won't be impacted. in fast_pf_fix_direct_spte()
3010 * calculated by sp->gfn. in fast_pf_fix_direct_spte()
3012 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); in fast_pf_fix_direct_spte()
3056 if (!is_last_spte(spte, sp->role.level)) in fast_page_fault()
3080 * Currently, to simplify the code, write-protection can in fast_page_fault()
3082 * write-protected for dirty-logging or access tracking. in fast_page_fault()
3089 * Do not fix write-permission on the large spte. Since in fast_page_fault()
3090 * we only dirty the first page into the dirty-bitmap in in fast_page_fault()
3099 if (sp->role.level > PG_LEVEL_4K) in fast_page_fault()
3145 if (sp->tdp_mmu_page) in mmu_free_root_page()
3147 else if (sp->role.invalid) in mmu_free_root_page()
3158 struct kvm *kvm = vcpu->kvm; in kvm_mmu_free_roots()
3166 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { in kvm_mmu_free_roots()
3169 VALID_PAGE(mmu->prev_roots[i].hpa)) in kvm_mmu_free_roots()
3176 spin_lock(&kvm->mmu_lock); in kvm_mmu_free_roots()
3180 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, in kvm_mmu_free_roots()
3184 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && in kvm_mmu_free_roots()
3185 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { in kvm_mmu_free_roots()
3186 mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); in kvm_mmu_free_roots()
3187 } else if (mmu->pae_root) { in kvm_mmu_free_roots()
3189 if (mmu->pae_root[i] != 0) in kvm_mmu_free_roots()
3191 &mmu->pae_root[i], in kvm_mmu_free_roots()
3194 mmu->root_hpa = INVALID_PAGE; in kvm_mmu_free_roots()
3195 mmu->root_pgd = 0; in kvm_mmu_free_roots()
3199 spin_unlock(&kvm->mmu_lock); in kvm_mmu_free_roots()
3220 spin_lock(&vcpu->kvm->mmu_lock); in mmu_alloc_root()
3223 spin_unlock(&vcpu->kvm->mmu_lock); in mmu_alloc_root()
3227 ++sp->root_count; in mmu_alloc_root()
3229 spin_unlock(&vcpu->kvm->mmu_lock); in mmu_alloc_root()
3230 return __pa(sp->spt); in mmu_alloc_root()
3235 u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; in mmu_alloc_direct_roots()
3239 if (vcpu->kvm->arch.tdp_mmu_enabled) { in mmu_alloc_direct_roots()
3243 return -ENOSPC; in mmu_alloc_direct_roots()
3244 vcpu->arch.mmu->root_hpa = root; in mmu_alloc_direct_roots()
3250 return -ENOSPC; in mmu_alloc_direct_roots()
3251 vcpu->arch.mmu->root_hpa = root; in mmu_alloc_direct_roots()
3254 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); in mmu_alloc_direct_roots()
3256 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), in mmu_alloc_direct_roots()
3259 return -ENOSPC; in mmu_alloc_direct_roots()
3260 vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; in mmu_alloc_direct_roots()
3262 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); in mmu_alloc_direct_roots()
3267 vcpu->arch.mmu->root_pgd = 0; in mmu_alloc_direct_roots()
3279 root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); in mmu_alloc_shadow_roots()
3287 * write-protect the guests page table root. in mmu_alloc_shadow_roots()
3289 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { in mmu_alloc_shadow_roots()
3290 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); in mmu_alloc_shadow_roots()
3293 vcpu->arch.mmu->shadow_root_level, false); in mmu_alloc_shadow_roots()
3295 return -ENOSPC; in mmu_alloc_shadow_roots()
3296 vcpu->arch.mmu->root_hpa = root; in mmu_alloc_shadow_roots()
3301 * We shadow a 32 bit page table. This may be a legacy 2-level in mmu_alloc_shadow_roots()
3302 * or a PAE 3-level page table. In either case we need to be aware that in mmu_alloc_shadow_roots()
3306 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { in mmu_alloc_shadow_roots()
3310 * Allocate the page for the PDPTEs when shadowing 32-bit NPT in mmu_alloc_shadow_roots()
3311 * with 64-bit only when needed. Unlike 32-bit NPT, it doesn't in mmu_alloc_shadow_roots()
3314 if (!vcpu->arch.mmu->pae_root) { in mmu_alloc_shadow_roots()
3317 vcpu->arch.mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); in mmu_alloc_shadow_roots()
3318 if (!vcpu->arch.mmu->pae_root) in mmu_alloc_shadow_roots()
3319 return -ENOMEM; in mmu_alloc_shadow_roots()
3324 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); in mmu_alloc_shadow_roots()
3325 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { in mmu_alloc_shadow_roots()
3326 pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); in mmu_alloc_shadow_roots()
3328 vcpu->arch.mmu->pae_root[i] = 0; in mmu_alloc_shadow_roots()
3339 return -ENOSPC; in mmu_alloc_shadow_roots()
3340 vcpu->arch.mmu->pae_root[i] = root | pm_mask; in mmu_alloc_shadow_roots()
3342 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); in mmu_alloc_shadow_roots()
3345 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP in mmu_alloc_shadow_roots()
3348 * on demand, as running a 32-bit L1 VMM is very rare. The PDP is in mmu_alloc_shadow_roots()
3351 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { in mmu_alloc_shadow_roots()
3352 if (vcpu->arch.mmu->lm_root == NULL) { in mmu_alloc_shadow_roots()
3357 return -ENOMEM; in mmu_alloc_shadow_roots()
3359 lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; in mmu_alloc_shadow_roots()
3361 vcpu->arch.mmu->lm_root = lm_root; in mmu_alloc_shadow_roots()
3364 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); in mmu_alloc_shadow_roots()
3368 vcpu->arch.mmu->root_pgd = root_pgd; in mmu_alloc_shadow_roots()
3375 if (vcpu->arch.mmu->direct_map) in mmu_alloc_roots()
3386 if (vcpu->arch.mmu->direct_map) in kvm_mmu_sync_roots()
3389 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) in kvm_mmu_sync_roots()
3394 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { in kvm_mmu_sync_roots()
3395 hpa_t root = vcpu->arch.mmu->root_hpa; in kvm_mmu_sync_roots()
3399 * Even if another CPU was marking the SP as unsync-ed in kvm_mmu_sync_roots()
3408 if (!smp_load_acquire(&sp->unsync) && in kvm_mmu_sync_roots()
3409 !smp_load_acquire(&sp->unsync_children)) in kvm_mmu_sync_roots()
3412 spin_lock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3418 spin_unlock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3422 spin_lock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3426 hpa_t root = vcpu->arch.mmu->pae_root[i]; in kvm_mmu_sync_roots()
3436 spin_unlock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3444 exception->error_code = 0; in nonpaging_gva_to_gpa()
3453 exception->error_code = 0; in nonpaging_gva_to_gpa_nested()
3454 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); in nonpaging_gva_to_gpa_nested()
3462 return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; in __is_rsvd_bits_set()
3467 return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); in __is_bad_mt_xwr()
3487 * That SPTE may be non-present.
3492 int leaf = -1; in get_walk()
3504 sptes[leaf - 1] = spte; in get_walk()
3523 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { in get_mmio_spte()
3528 if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) in get_mmio_spte()
3538 rsvd_check = &vcpu->arch.mmu->shadow_zero_check; in get_mmio_spte()
3540 for (level = root; level >= leaf; level--) { in get_mmio_spte()
3541 if (!is_shadow_present_pte(sptes[level - 1])) in get_mmio_spte()
3544 * Use a bitwise-OR instead of a logical-OR to aggregate the in get_mmio_spte()
3548 reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) || in get_mmio_spte()
3549 __is_rsvd_bits_set(rsvd_check, sptes[level - 1], in get_mmio_spte()
3556 for (level = root; level >= leaf; level--) in get_mmio_spte()
3557 pr_err("------ spte 0x%llx level %d.\n", in get_mmio_spte()
3558 sptes[level - 1], level); in get_mmio_spte()
3561 *sptep = sptes[leaf - 1]; in get_mmio_spte()
3576 return -EINVAL; in handle_mmio_page_fault()
3639 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; in kvm_arch_setup_async_pf()
3641 arch.direct_map = vcpu->arch.mmu->direct_map; in kvm_arch_setup_async_pf()
3642 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); in kvm_arch_setup_async_pf()
3660 if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) in try_async_pf()
3703 if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { in direct_page_fault()
3713 mmu_seq = vcpu->kvm->mmu_notifier_seq; in direct_page_fault()
3723 spin_lock(&vcpu->kvm->mmu_lock); in direct_page_fault()
3724 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) in direct_page_fault()
3730 if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) in direct_page_fault()
3738 spin_unlock(&vcpu->kvm->mmu_lock); in direct_page_fault()
3757 u32 flags = vcpu->arch.apf.host_apf_flags; in kvm_handle_page_fault()
3760 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ in kvm_handle_page_fault()
3762 return -EFAULT; in kvm_handle_page_fault()
3765 vcpu->arch.l1tf_flush_l1d = true; in kvm_handle_page_fault()
3774 vcpu->arch.apf.host_apf_flags = 0; in kvm_handle_page_fault()
3793 max_level--) { in kvm_tdp_page_fault()
3795 gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); in kvm_tdp_page_fault()
3808 context->page_fault = nonpaging_page_fault; in nonpaging_init_context()
3809 context->gva_to_gpa = nonpaging_gva_to_gpa; in nonpaging_init_context()
3810 context->sync_page = nonpaging_sync_page; in nonpaging_init_context()
3811 context->invlpg = NULL; in nonpaging_init_context()
3812 context->root_level = 0; in nonpaging_init_context()
3813 context->shadow_root_level = PT32E_ROOT_LEVEL; in nonpaging_init_context()
3814 context->direct_map = true; in nonpaging_init_context()
3815 context->nx = false; in nonpaging_init_context()
3821 return (role.direct || pgd == root->pgd) && in is_root_usable()
3822 VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && in is_root_usable()
3823 role.word == to_shadow_page(root->hpa)->role.word; in is_root_usable()
3829 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
3831 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
3839 struct kvm_mmu *mmu = vcpu->arch.mmu; in cached_root_available()
3841 root.pgd = mmu->root_pgd; in cached_root_available()
3842 root.hpa = mmu->root_hpa; in cached_root_available()
3848 swap(root, mmu->prev_roots[i]); in cached_root_available()
3854 mmu->root_hpa = root.hpa; in cached_root_available()
3855 mmu->root_pgd = root.pgd; in cached_root_available()
3863 struct kvm_mmu *mmu = vcpu->arch.mmu; in fast_pgd_switch()
3866 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid in fast_pgd_switch()
3867 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs in fast_pgd_switch()
3870 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && in fast_pgd_switch()
3871 mmu->root_level >= PT64_ROOT_4LEVEL) in fast_pgd_switch()
3882 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); in __kvm_mmu_new_pgd()
3901 * switching to a new CR3, that GVA->GPA mapping may no longer be in __kvm_mmu_new_pgd()
3913 to_shadow_page(vcpu->arch.mmu->root_hpa)); in __kvm_mmu_new_pgd()
3950 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. in is_last_gpte()
3954 gpte &= level - mmu->last_nonleaf_level; in is_last_gpte()
3961 gpte |= level - PG_LEVEL_4K - 1; in is_last_gpte()
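
The two is_last_gpte() lines above are an unsigned-underflow trick: subtracting two small level numbers yields a value with bit 7 (the position of the PS bit) set exactly when the left operand is smaller, so the gpte's PS bit is masked off where large pages cannot exist and forced on at PG_LEVEL_4K. A speculative standalone demo of the same arithmetic, assuming the result is ultimately tested against a bit-7 PT_PAGE_SIZE_MASK:

    #include <stdio.h>

    #define PT_PAGE_SIZE_MASK (1u << 7)     /* PS bit of a guest PTE (assumed) */
    #define PG_LEVEL_4K 1

    static int is_last_demo(unsigned level, unsigned last_nonleaf, unsigned gpte)
    {
            gpte &= level - last_nonleaf;    /* bit 7 survives only if level < last_nonleaf */
            gpte |= level - PG_LEVEL_4K - 1; /* forces bit 7 when level == PG_LEVEL_4K */
            return !!(gpte & PT_PAGE_SIZE_MASK);
    }

    int main(void)
    {
            /* A 4K leaf always terminates; a 2M PDE terminates only if PS is set. */
            printf("%d %d %d\n",
                   is_last_demo(1, 3, 0),                   /* 1: 4K PTE */
                   is_last_demo(2, 3, PT_PAGE_SIZE_MASK),   /* 1: PDE with PS=1 */
                   is_last_demo(2, 3, 0));                  /* 0: PDE pointing to a PT */
            return 0;
    }
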
3989 rsvd_check->bad_mt_xwr = 0; in __reset_rsvds_bits_mask()
3997 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for in __reset_rsvds_bits_mask()
4006 rsvd_check->rsvd_bits_mask[0][1] = 0; in __reset_rsvds_bits_mask()
4007 rsvd_check->rsvd_bits_mask[0][0] = 0; in __reset_rsvds_bits_mask()
4008 rsvd_check->rsvd_bits_mask[1][0] = in __reset_rsvds_bits_mask()
4009 rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask()
4012 rsvd_check->rsvd_bits_mask[1][1] = 0; in __reset_rsvds_bits_mask()
4018 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); in __reset_rsvds_bits_mask()
4021 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); in __reset_rsvds_bits_mask()
4024 rsvd_check->rsvd_bits_mask[0][2] = in __reset_rsvds_bits_mask()
4027 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4029 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4031 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4034 rsvd_check->rsvd_bits_mask[1][0] = in __reset_rsvds_bits_mask()
4035 rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask()
4038 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4041 rsvd_check->rsvd_bits_mask[1][4] = in __reset_rsvds_bits_mask()
4042 rsvd_check->rsvd_bits_mask[0][4]; in __reset_rsvds_bits_mask()
4045 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4048 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4051 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4053 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4055 rsvd_check->rsvd_bits_mask[1][3] = in __reset_rsvds_bits_mask()
4056 rsvd_check->rsvd_bits_mask[0][3]; in __reset_rsvds_bits_mask()
4057 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4060 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4063 rsvd_check->rsvd_bits_mask[1][0] = in __reset_rsvds_bits_mask()
4064 rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask()
4072 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, in reset_rsvds_bits_mask()
4073 cpuid_maxphyaddr(vcpu), context->root_level, in reset_rsvds_bits_mask()
4074 context->nx, in reset_rsvds_bits_mask()
4086 rsvd_check->rsvd_bits_mask[0][4] = in __reset_rsvds_bits_mask_ept()
4088 rsvd_check->rsvd_bits_mask[0][3] = in __reset_rsvds_bits_mask_ept()
4090 rsvd_check->rsvd_bits_mask[0][2] = in __reset_rsvds_bits_mask_ept()
4092 rsvd_check->rsvd_bits_mask[0][1] = in __reset_rsvds_bits_mask_ept()
4094 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); in __reset_rsvds_bits_mask_ept()
4097 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; in __reset_rsvds_bits_mask_ept()
4098 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; in __reset_rsvds_bits_mask_ept()
4099 rsvd_check->rsvd_bits_mask[1][2] = in __reset_rsvds_bits_mask_ept()
4101 rsvd_check->rsvd_bits_mask[1][1] = in __reset_rsvds_bits_mask_ept()
4103 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask_ept()
4114 rsvd_check->bad_mt_xwr = bad_mt_xwr; in __reset_rsvds_bits_mask_ept()
4120 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, in reset_rsvds_bits_mask_ept()
4134 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and in reset_shadow_zero_bits_mask()
4135 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. in reset_shadow_zero_bits_mask()
4136 * The iTLB multi-hit workaround can be toggled at any time, so assume in reset_shadow_zero_bits_mask()
4137 * NX can be used by any non-nested shadow MMU to avoid having to reset in reset_shadow_zero_bits_mask()
4140 bool uses_nx = context->nx || !tdp_enabled || in reset_shadow_zero_bits_mask()
4141 context->mmu_role.base.smep_andnot_wp; in reset_shadow_zero_bits_mask()
4149 shadow_zero_check = &context->shadow_zero_check; in reset_shadow_zero_bits_mask()
4152 context->shadow_root_level, uses_nx, in reset_shadow_zero_bits_mask()
4159 for (i = context->shadow_root_level; --i >= 0;) { in reset_shadow_zero_bits_mask()
4160 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; in reset_shadow_zero_bits_mask()
4161 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; in reset_shadow_zero_bits_mask()
4175 * possible, however, kvm currently does not do execution-protection.
4184 shadow_zero_check = &context->shadow_zero_check; in reset_tdp_shadow_zero_bits_mask()
4189 context->shadow_root_level, false, in reset_tdp_shadow_zero_bits_mask()
4200 for (i = context->shadow_root_level; --i >= 0;) { in reset_tdp_shadow_zero_bits_mask()
4201 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; in reset_tdp_shadow_zero_bits_mask()
4202 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; in reset_tdp_shadow_zero_bits_mask()
4214 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, in reset_ept_shadow_zero_bits_mask()
4241 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { in update_permission_bitmask()
4249 /* Faults from writes to non-writable pages */ in update_permission_bitmask()
4253 /* Faults from fetches of non-executable pages*/ in update_permission_bitmask()
4265 if (!mmu->nx) in update_permission_bitmask()
4268 /* Allow supervisor writes if !cr0.wp */ in update_permission_bitmask()
4277 * SMAP:kernel-mode data accesses from user-mode in update_permission_bitmask()
4281 * - X86_CR4_SMAP is set in CR4 in update_permission_bitmask()
4282 * - A user page is accessed in update_permission_bitmask()
4283 * - The access is not a fetch in update_permission_bitmask()
4284 * - Page fault in kernel mode in update_permission_bitmask()
4285 * - if CPL = 3 or X86_EFLAGS_AC is clear in update_permission_bitmask()
4296 mmu->permissions[byte] = ff | uf | wf | smepf | smapf; in update_permission_bitmask()
4302 * user-mode addresses based on the value in the PKRU register. Protection
4311 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4312 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4313 * - PK is always zero if U=0 in the page tables
4314 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4328 bool wp; in update_pkru_bitmask() local
4331 mmu->pkru_mask = 0; in update_pkru_bitmask()
4337 mmu->pkru_mask = 0; in update_pkru_bitmask()
4341 wp = is_write_protection(vcpu); in update_pkru_bitmask()
4343 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { in update_pkru_bitmask()
4362 * user access or CR0.WP = 1. in update_pkru_bitmask()
4364 check_write = check_pkey && wf && (uf || wp); in update_pkru_bitmask()
4371 mmu->pkru_mask |= (pkey_bits & 3) << pfec; in update_pkru_bitmask()
4377 unsigned root_level = mmu->root_level; in update_last_nonleaf_level()
4379 mmu->last_nonleaf_level = root_level; in update_last_nonleaf_level()
4381 mmu->last_nonleaf_level++; in update_last_nonleaf_level()
4388 context->nx = is_nx(vcpu); in paging64_init_context_common()
4389 context->root_level = level; in paging64_init_context_common()
4397 context->page_fault = paging64_page_fault; in paging64_init_context_common()
4398 context->gva_to_gpa = paging64_gva_to_gpa; in paging64_init_context_common()
4399 context->sync_page = paging64_sync_page; in paging64_init_context_common()
4400 context->invlpg = paging64_invlpg; in paging64_init_context_common()
4401 context->shadow_root_level = level; in paging64_init_context_common()
4402 context->direct_map = false; in paging64_init_context_common()
4417 context->nx = false; in paging32_init_context()
4418 context->root_level = PT32_ROOT_LEVEL; in paging32_init_context()
4425 context->page_fault = paging32_page_fault; in paging32_init_context()
4426 context->gva_to_gpa = paging32_gva_to_gpa; in paging32_init_context()
4427 context->sync_page = paging32_sync_page; in paging32_init_context()
4428 context->invlpg = paging32_invlpg; in paging32_init_context()
4429 context->shadow_root_level = PT32E_ROOT_LEVEL; in paging32_init_context()
4430 context->direct_map = false; in paging32_init_context()
4478 /* Use 5-level TDP if and only if it's useful/necessary. */ in kvm_mmu_get_tdp_level()
4500 struct kvm_mmu *context = &vcpu->arch.root_mmu; in init_kvm_tdp_mmu()
4504 if (new_role.as_u64 == context->mmu_role.as_u64) in init_kvm_tdp_mmu()
4507 context->mmu_role.as_u64 = new_role.as_u64; in init_kvm_tdp_mmu()
4508 context->page_fault = kvm_tdp_page_fault; in init_kvm_tdp_mmu()
4509 context->sync_page = nonpaging_sync_page; in init_kvm_tdp_mmu()
4510 context->invlpg = NULL; in init_kvm_tdp_mmu()
4511 context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); in init_kvm_tdp_mmu()
4512 context->direct_map = true; in init_kvm_tdp_mmu()
4513 context->get_guest_pgd = get_cr3; in init_kvm_tdp_mmu()
4514 context->get_pdptr = kvm_pdptr_read; in init_kvm_tdp_mmu()
4515 context->inject_page_fault = kvm_inject_page_fault; in init_kvm_tdp_mmu()
4518 context->nx = false; in init_kvm_tdp_mmu()
4519 context->gva_to_gpa = nonpaging_gva_to_gpa; in init_kvm_tdp_mmu()
4520 context->root_level = 0; in init_kvm_tdp_mmu()
4522 context->nx = is_nx(vcpu); in init_kvm_tdp_mmu()
4523 context->root_level = is_la57_mode(vcpu) ? in init_kvm_tdp_mmu()
4526 context->gva_to_gpa = paging64_gva_to_gpa; in init_kvm_tdp_mmu()
4528 context->nx = is_nx(vcpu); in init_kvm_tdp_mmu()
4529 context->root_level = PT32E_ROOT_LEVEL; in init_kvm_tdp_mmu()
4531 context->gva_to_gpa = paging64_gva_to_gpa; in init_kvm_tdp_mmu()
4533 context->nx = false; in init_kvm_tdp_mmu()
4534 context->root_level = PT32_ROOT_LEVEL; in init_kvm_tdp_mmu()
4536 context->gva_to_gpa = paging32_gva_to_gpa; in init_kvm_tdp_mmu()
4590 context->mmu_role.as_u64 = new_role.as_u64; in shadow_mmu_init_context()
4596 struct kvm_mmu *context = &vcpu->arch.root_mmu; in kvm_init_shadow_mmu()
4600 if (new_role.as_u64 != context->mmu_role.as_u64) in kvm_init_shadow_mmu()
4619 struct kvm_mmu *context = &vcpu->arch.guest_mmu; in kvm_init_shadow_npt_mmu()
4624 if (new_role.as_u64 != context->mmu_role.as_u64) { in kvm_init_shadow_npt_mmu()
4631 context->shadow_root_level = new_role.base.level; in kvm_init_shadow_npt_mmu()
4643 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; in kvm_calc_shadow_ept_root_page_role()
4653 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the in kvm_calc_shadow_ept_root_page_role()
4668 struct kvm_mmu *context = &vcpu->arch.guest_mmu; in kvm_init_shadow_ept_mmu()
4676 if (new_role.as_u64 == context->mmu_role.as_u64) in kvm_init_shadow_ept_mmu()
4679 context->shadow_root_level = level; in kvm_init_shadow_ept_mmu()
4681 context->nx = true; in kvm_init_shadow_ept_mmu()
4682 context->ept_ad = accessed_dirty; in kvm_init_shadow_ept_mmu()
4683 context->page_fault = ept_page_fault; in kvm_init_shadow_ept_mmu()
4684 context->gva_to_gpa = ept_gva_to_gpa; in kvm_init_shadow_ept_mmu()
4685 context->sync_page = ept_sync_page; in kvm_init_shadow_ept_mmu()
4686 context->invlpg = ept_invlpg; in kvm_init_shadow_ept_mmu()
4687 context->root_level = level; in kvm_init_shadow_ept_mmu()
4688 context->direct_map = false; in kvm_init_shadow_ept_mmu()
4689 context->mmu_role.as_u64 = new_role.as_u64; in kvm_init_shadow_ept_mmu()
4701 struct kvm_mmu *context = &vcpu->arch.root_mmu; in init_kvm_softmmu()
4706 vcpu->arch.efer); in init_kvm_softmmu()
4708 context->get_guest_pgd = get_cr3; in init_kvm_softmmu()
4709 context->get_pdptr = kvm_pdptr_read; in init_kvm_softmmu()
4710 context->inject_page_fault = kvm_inject_page_fault; in init_kvm_softmmu()
4718 * Nested MMUs are used only for walking L2's gva->gpa, they never have in kvm_calc_nested_mmu_role()
4740 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; in init_kvm_nested_mmu()
4742 if (new_role.as_u64 == g_context->mmu_role.as_u64) in init_kvm_nested_mmu()
4745 g_context->mmu_role.as_u64 = new_role.as_u64; in init_kvm_nested_mmu()
4746 g_context->get_guest_pgd = get_cr3; in init_kvm_nested_mmu()
4747 g_context->get_pdptr = kvm_pdptr_read; in init_kvm_nested_mmu()
4748 g_context->inject_page_fault = kvm_inject_page_fault; in init_kvm_nested_mmu()
4754 g_context->invlpg = NULL; in init_kvm_nested_mmu()
4757 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using in init_kvm_nested_mmu()
4765 g_context->nx = false; in init_kvm_nested_mmu()
4766 g_context->root_level = 0; in init_kvm_nested_mmu()
4767 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; in init_kvm_nested_mmu()
4769 g_context->nx = is_nx(vcpu); in init_kvm_nested_mmu()
4770 g_context->root_level = is_la57_mode(vcpu) ? in init_kvm_nested_mmu()
4773 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; in init_kvm_nested_mmu()
4775 g_context->nx = is_nx(vcpu); in init_kvm_nested_mmu()
4776 g_context->root_level = PT32E_ROOT_LEVEL; in init_kvm_nested_mmu()
4778 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; in init_kvm_nested_mmu()
4780 g_context->nx = false; in init_kvm_nested_mmu()
4781 g_context->root_level = PT32_ROOT_LEVEL; in init_kvm_nested_mmu()
4783 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; in init_kvm_nested_mmu()
4796 vcpu->arch.mmu->root_hpa = INVALID_PAGE; in kvm_init_mmu()
4799 vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; in kvm_init_mmu()
4835 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); in kvm_mmu_load()
4851 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); in kvm_mmu_unload()
4852 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa)); in kvm_mmu_unload()
4853 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); in kvm_mmu_unload()
4854 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); in kvm_mmu_unload()
4883 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ in mmu_pte_write_fetch_gpte()
4904 * Skip write-flooding detected for the sp whose level is 1, because in detect_write_flooding()
4905 * it can become unsync, then the guest page is not write-protected. in detect_write_flooding()
4907 if (sp->role.level == PG_LEVEL_4K) in detect_write_flooding()
4910 atomic_inc(&sp->write_flooding_count); in detect_write_flooding()
4911 return atomic_read(&sp->write_flooding_count) >= 3; in detect_write_flooding()
4924 gpa, bytes, sp->role.word); in detect_write_misaligned()
4927 pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; in detect_write_misaligned()
4933 if (!(offset & (pte_size - 1)) && bytes == 1) in detect_write_misaligned()
4936 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); in detect_write_misaligned()
4949 level = sp->role.level; in get_written_sptes()
4951 if (!sp->role.gpte_is_8_bytes) { in get_written_sptes()
4952 page_offset <<= 1; /* 32->64 */ in get_written_sptes()
4954 * A 32-bit pde maps 4MB while the shadow pdes map in get_written_sptes()
4965 if (quadrant != sp->role.quadrant) in get_written_sptes()
4969 spte = &sp->spt[page_offset / sizeof(*spte)]; in get_written_sptes()
4986 * write-protected, so we can exit simply. in kvm_mmu_pte_write()
4988 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) in kvm_mmu_pte_write()
5002 spin_lock(&vcpu->kvm->mmu_lock); in kvm_mmu_pte_write()
5006 ++vcpu->kvm->stat.mmu_pte_write; in kvm_mmu_pte_write()
5009 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { in kvm_mmu_pte_write()
5012 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); in kvm_mmu_pte_write()
5013 ++vcpu->kvm->stat.mmu_flooded; in kvm_mmu_pte_write()
5022 while (npte--) { in kvm_mmu_pte_write()
5024 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); in kvm_mmu_pte_write()
5025 if (gentry && sp->role.level != PG_LEVEL_4K) in kvm_mmu_pte_write()
5026 ++vcpu->kvm->stat.mmu_pde_zapped; in kvm_mmu_pte_write()
5034 spin_unlock(&vcpu->kvm->mmu_lock); in kvm_mmu_pte_write()
5042 if (vcpu->arch.mmu->direct_map) in kvm_mmu_unprotect_page_virt()
5047 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); in kvm_mmu_unprotect_page_virt()
5057 bool direct = vcpu->arch.mmu->direct_map; in kvm_mmu_page_fault()
5059 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) in kvm_mmu_page_fault()
5073 return -EIO; in kvm_mmu_page_fault()
5088 if (vcpu->arch.mmu->direct_map && in kvm_mmu_page_fault()
5090 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); in kvm_mmu_page_fault()
5095 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still in kvm_mmu_page_fault()
5097 * re-execute the instruction that caused the page fault. Do not allow in kvm_mmu_page_fault()
5100 * faulting on the non-existent MMIO address. Retrying an instruction in kvm_mmu_page_fault()
5118 /* It's actually a GPA for vcpu->arch.guest_mmu. */ in kvm_mmu_invalidate_gva()
5119 if (mmu != &vcpu->arch.guest_mmu) { in kvm_mmu_invalidate_gva()
5120 /* INVLPG on a non-canonical address is a NOP according to the SDM. */ in kvm_mmu_invalidate_gva()
5127 if (!mmu->invlpg) in kvm_mmu_invalidate_gva()
5131 mmu->invlpg(vcpu, gva, mmu->root_hpa); in kvm_mmu_invalidate_gva()
5145 if (VALID_PAGE(mmu->prev_roots[i].hpa)) in kvm_mmu_invalidate_gva()
5146 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); in kvm_mmu_invalidate_gva()
5148 mmu->invlpg(vcpu, gva, root_hpa); in kvm_mmu_invalidate_gva()
5155 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); in kvm_mmu_invlpg()
5156 ++vcpu->stat.invlpg; in kvm_mmu_invlpg()
5163 struct kvm_mmu *mmu = vcpu->arch.mmu; in kvm_mmu_invpcid_gva()
5168 mmu->invlpg(vcpu, gva, mmu->root_hpa); in kvm_mmu_invpcid_gva()
5173 if (VALID_PAGE(mmu->prev_roots[i].hpa) && in kvm_mmu_invpcid_gva()
5174 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { in kvm_mmu_invpcid_gva()
5175 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); in kvm_mmu_invpcid_gva()
5183 ++vcpu->stat.invlpg; in kvm_mmu_invpcid_gva()
5218 /* The caller should hold mmu-lock before calling this function. */
5232 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { in slot_handle_level_range()
5236 iterator.gfn - start_gfn + 1); in slot_handle_level_range()
5239 cond_resched_lock(&kvm->mmu_lock); in slot_handle_level_range()
5245 end_gfn - start_gfn + 1); in slot_handle_level_range()
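/*
 * Editor's note (illustrative): the flush sizes above are inclusive gfn
 * ranges, hence the "+ 1": flushing [start_gfn, gfn] covers
 * gfn - start_gfn + 1 pages.  Standalone check of that arithmetic:
 */
#include <assert.h>

int main(void)
{
	unsigned long long start_gfn = 0x100, gfn = 0x1ff;

	assert(gfn - start_gfn + 1 == 0x100);	/* 256 pages: 0x100 .. 0x1ff */
	return 0;
}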
5258 end_level, memslot->base_gfn, in slot_handle_level()
5259 memslot->base_gfn + memslot->npages - 1, in slot_handle_level()
5289 free_page((unsigned long)mmu->pae_root); in free_mmu_pages()
5290 free_page((unsigned long)mmu->lm_root); in free_mmu_pages()
5298 mmu->root_hpa = INVALID_PAGE; in __kvm_mmu_create()
5299 mmu->root_pgd = 0; in __kvm_mmu_create()
5300 mmu->translate_gpa = translate_gpa; in __kvm_mmu_create()
5302 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; in __kvm_mmu_create()
5306 * while the PDP table is a per-vCPU construct that's allocated at MMU in __kvm_mmu_create()
5307 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on in __kvm_mmu_create()
5311 * table. The main exception, handled here, is SVM's 32-bit NPT. The in __kvm_mmu_create()
5312 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit in __kvm_mmu_create()
5313 * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). in __kvm_mmu_create()
5320 return -ENOMEM; in __kvm_mmu_create()
5322 mmu->pae_root = page_address(page); in __kvm_mmu_create()
5324 mmu->pae_root[i] = INVALID_PAGE; in __kvm_mmu_create()
5333 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; in kvm_mmu_create()
5334 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; in kvm_mmu_create()
5336 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; in kvm_mmu_create()
5337 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; in kvm_mmu_create()
5339 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; in kvm_mmu_create()
5341 vcpu->arch.mmu = &vcpu->arch.root_mmu; in kvm_mmu_create()
5342 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; in kvm_mmu_create()
5344 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; in kvm_mmu_create()
5346 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); in kvm_mmu_create()
5350 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); in kvm_mmu_create()
5356 free_mmu_pages(&vcpu->arch.guest_mmu); in kvm_mmu_create()
5368 &kvm->arch.active_mmu_pages, link) { in kvm_zap_obsolete_pages()
5381 if (WARN_ON(sp->role.invalid)) in kvm_zap_obsolete_pages()
5391 cond_resched_lock(&kvm->mmu_lock)) { in kvm_zap_obsolete_pages()
5397 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { in kvm_zap_obsolete_pages()
5408 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); in kvm_zap_obsolete_pages()
5412 * Fast-invalidate all shadow pages, using a lock-break technique
5417 * not use any resource of the slot being deleted, or of any slot
5422 lockdep_assert_held(&kvm->slots_lock); in kvm_mmu_zap_all_fast()
5424 spin_lock(&kvm->mmu_lock); in kvm_mmu_zap_all_fast()
5434 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; in kvm_mmu_zap_all_fast()
5448 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_zap_all_fast()
5451 spin_unlock(&kvm->mmu_lock); in kvm_mmu_zap_all_fast()
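/*
 * Editor's illustrative sketch (not part of the kernel source): the
 * single-bit toggle above (mmu_valid_gen = mmu_valid_gen ? 0 : 1) marks
 * every existing shadow page obsolete in O(1); obsolete pages are then
 * zapped with lock breaks instead of all at once.  A minimal standalone
 * model of that obsolescence check, with hypothetical struct and
 * function names:
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_arch { unsigned long mmu_valid_gen; };
struct fake_sp   { unsigned long mmu_valid_gen; };

static bool sp_is_obsolete(const struct fake_arch *arch, const struct fake_sp *sp)
{
	return sp->mmu_valid_gen != arch->mmu_valid_gen;
}

int main(void)
{
	struct fake_arch arch = { .mmu_valid_gen = 0 };
	struct fake_sp sp = { .mmu_valid_gen = arch.mmu_valid_gen };

	arch.mmu_valid_gen = arch.mmu_valid_gen ? 0 : 1;	/* fast invalidate */
	printf("obsolete=%d\n", sp_is_obsolete(&arch, &sp));	/* obsolete=1 */
	return 0;
}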
5456 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); in kvm_has_zapped_obsolete_pages()
5468 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; in kvm_mmu_init_vm()
5472 node->track_write = kvm_mmu_pte_write; in kvm_mmu_init_vm()
5473 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; in kvm_mmu_init_vm()
5479 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; in kvm_mmu_uninit_vm()
5493 spin_lock(&kvm->mmu_lock); in kvm_zap_gfn_range()
5499 start = max(gfn_start, memslot->base_gfn); in kvm_zap_gfn_range()
5500 end = min(gfn_end, memslot->base_gfn + memslot->npages); in kvm_zap_gfn_range()
5507 start, end - 1, true); in kvm_zap_gfn_range()
5511 if (kvm->arch.tdp_mmu_enabled) { in kvm_zap_gfn_range()
5517 spin_unlock(&kvm->mmu_lock); in kvm_zap_gfn_range()
5532 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_remove_write_access()
5535 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_remove_write_access()
5537 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_remove_write_access()
5575 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && in kvm_mmu_zap_collapsible_spte()
5581 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, in kvm_mmu_zap_collapsible_spte()
5582 KVM_PAGES_PER_HPAGE(sp->role.level)); in kvm_mmu_zap_collapsible_spte()
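/*
 * Editor's note (illustrative): the flush above spans
 * KVM_PAGES_PER_HPAGE(sp->role.level) 4 KiB gfns starting at sp->gfn,
 * i.e. the whole region a mapping at that level covers.  On x86 each
 * paging level adds 9 address bits, so the counts are powers of 512:
 */
#include <stdio.h>

int main(void)
{
	unsigned long long pages_2m = (2ULL << 20) / (4 << 10);	/* 512 */
	unsigned long long pages_1g = (1ULL << 30) / (4 << 10);	/* 262144 */

	printf("2M huge page: %llu 4K pages\n", pages_2m);
	printf("1G huge page: %llu 4K pages\n", pages_1g);
	return 0;
}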
5596 /* FIXME: const-ify all uses of struct kvm_memory_slot. */ in kvm_mmu_zap_collapsible_sptes()
5597 spin_lock(&kvm->mmu_lock); in kvm_mmu_zap_collapsible_sptes()
5601 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_zap_collapsible_sptes()
5603 spin_unlock(&kvm->mmu_lock); in kvm_mmu_zap_collapsible_sptes()
5616 lockdep_assert_held(&kvm->slots_lock); in kvm_arch_flush_remote_tlbs_memslot()
5617 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, in kvm_arch_flush_remote_tlbs_memslot()
5618 memslot->npages); in kvm_arch_flush_remote_tlbs_memslot()
5626 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_leaf_clear_dirty()
5628 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_leaf_clear_dirty()
5630 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_leaf_clear_dirty()
5648 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_largepage_remove_write_access()
5651 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_largepage_remove_write_access()
5653 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_largepage_remove_write_access()
5665 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_set_dirty()
5667 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_set_dirty()
5669 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_set_dirty()
5682 spin_lock(&kvm->mmu_lock); in kvm_mmu_zap_all()
5684 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { in kvm_mmu_zap_all()
5685 if (WARN_ON(sp->role.invalid)) in kvm_mmu_zap_all()
5689 if (cond_resched_lock(&kvm->mmu_lock)) in kvm_mmu_zap_all()
5695 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_zap_all()
5698 spin_unlock(&kvm->mmu_lock); in kvm_mmu_zap_all()
5714 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); in kvm_mmu_invalidate_mmio_sptes()
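/*
 * Editor's note (illustrative): memslot generations advance in steps of
 * the number of address spaces (assumed here to be 2 on x86: normal and
 * SMM), so the low bit effectively encodes the address space.  The mask
 * above strips that modifier before the MMIO generation is checked for
 * wrap.  Standalone demo of the masking:
 */
#include <inttypes.h>
#include <stdio.h>

#define ADDRESS_SPACES 2ULL	/* assumed value of KVM_ADDRESS_SPACE_NUM */

int main(void)
{
	uint64_t gen = 0x2b;				/* arbitrary example value */

	gen &= ~(ADDRESS_SPACES - 1);			/* strip the AS modifier */
	printf("masked gen = 0x%" PRIx64 "\n", gen);	/* 0x2a */
	return 0;
}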
5730 int nr_to_scan = sc->nr_to_scan; in mmu_shrink_scan()
5740 * Never scan more than sc->nr_to_scan VM instances. in mmu_shrink_scan()
5745 if (!nr_to_scan--) in mmu_shrink_scan()
5748 * n_used_mmu_pages is accessed without holding kvm->mmu_lock in mmu_shrink_scan()
5753 if (!kvm->arch.n_used_mmu_pages && in mmu_shrink_scan()
5757 idx = srcu_read_lock(&kvm->srcu); in mmu_shrink_scan()
5758 spin_lock(&kvm->mmu_lock); in mmu_shrink_scan()
5762 &kvm->arch.zapped_obsolete_pages); in mmu_shrink_scan()
5766 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); in mmu_shrink_scan()
5769 spin_unlock(&kvm->mmu_lock); in mmu_shrink_scan()
5770 srcu_read_unlock(&kvm->srcu, idx); in mmu_shrink_scan()
5774 * per-vm shrinkers cry out in mmu_shrink_scan()
5777 list_move_tail(&kvm->vm_list, &vm_list); in mmu_shrink_scan()
5809 * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT in kvm_set_mmio_spte_mask()
5811 * 52-bit physical addresses then there are no reserved PA bits in the in kvm_set_mmio_spte_mask()
5846 return -EINVAL; in set_nx_huge_pages()
5856 mutex_lock(&kvm->slots_lock); in set_nx_huge_pages()
5858 mutex_unlock(&kvm->slots_lock); in set_nx_huge_pages()
5860 wake_up_process(kvm->arch.nx_lpage_recovery_thread); in set_nx_huge_pages()
5870 int ret = -ENOMEM; in kvm_mmu_module_init()
5872 if (nx_huge_pages == -1) in kvm_mmu_module_init()
5930 nr_pages += memslot->npages; in kvm_mmu_calculate_default_mmu_pages()
5942 free_mmu_pages(&vcpu->arch.root_mmu); in kvm_mmu_destroy()
5943 free_mmu_pages(&vcpu->arch.guest_mmu); in kvm_mmu_destroy()
5972 wake_up_process(kvm->arch.nx_lpage_recovery_thread); in set_nx_huge_pages_recovery_ratio()
5989 rcu_idx = srcu_read_lock(&kvm->srcu); in kvm_recover_nx_lpages()
5990 spin_lock(&kvm->mmu_lock); in kvm_recover_nx_lpages()
5993 to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; in kvm_recover_nx_lpages()
5994 for ( ; to_zap; --to_zap) { in kvm_recover_nx_lpages()
5995 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) in kvm_recover_nx_lpages()
6003 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, in kvm_recover_nx_lpages()
6006 WARN_ON_ONCE(!sp->lpage_disallowed); in kvm_recover_nx_lpages()
6007 if (sp->tdp_mmu_page) { in kvm_recover_nx_lpages()
6011 WARN_ON_ONCE(sp->lpage_disallowed); in kvm_recover_nx_lpages()
6014 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { in kvm_recover_nx_lpages()
6016 cond_resched_lock(&kvm->mmu_lock); in kvm_recover_nx_lpages()
6022 spin_unlock(&kvm->mmu_lock); in kvm_recover_nx_lpages()
6023 srcu_read_unlock(&kvm->srcu, rcu_idx); in kvm_recover_nx_lpages()
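/*
 * Editor's note (illustrative): the to_zap computation shown above means
 * each recovery pass zaps roughly 1/ratio of the NX-split huge pages,
 * rounded up, and a ratio of 0 disables zapping entirely.  Standalone
 * check of that DIV_ROUND_UP arithmetic, e.g. 1000 splits at ratio 60:
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned long to_zap(unsigned long splits, unsigned int ratio)
{
	return ratio ? DIV_ROUND_UP(splits, ratio) : 0;
}

int main(void)
{
	printf("%lu\n", to_zap(1000, 60));	/* 17 pages zapped this pass */
	printf("%lu\n", to_zap(1000, 0));	/* 0: recovery disabled */
	return 0;
}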
6029 ? start_time + 60 * HZ - get_jiffies_64() in get_nx_lpage_recovery_timeout()
6063 "kvm-nx-lpage-recovery", in kvm_mmu_post_init_vm()
6064 &kvm->arch.nx_lpage_recovery_thread); in kvm_mmu_post_init_vm()
6066 kthread_unpark(kvm->arch.nx_lpage_recovery_thread); in kvm_mmu_post_init_vm()
6073 if (kvm->arch.nx_lpage_recovery_thread) in kvm_mmu_pre_destroy_vm()
6074 kthread_stop(kvm->arch.nx_lpage_recovery_thread); in kvm_mmu_pre_destroy_vm()