
Lines Matching +full:wp +full:- +full:content

1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
5 * This module enables machines with Intel VT-x extensions to run virtual
59 static int __read_mostly nx_huge_pages = -1;
90 * When setting this variable to true it enables Two-Dimensional-Paging
92 * 1. the guest-virtual to guest-physical
93 * 2. while doing 1. it walks guest-physical to host-physical
120 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
123 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
124 * PT32_LEVEL_BITS))) - 1))
127 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
134 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
135 * PT32_LEVEL_BITS))) - 1))
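
The bare expressions above are continuation lines of mmu.c's 32-bit (non-PAE) paging macros (PT32_LEVEL_SHIFT and friends). As a quick illustration of what they compute, here is a standalone sketch, assuming PAGE_SHIFT = 12 and PT32_LEVEL_BITS = 10 (1024 entries per level), with the index macro reconstructed under its usual name:

    #include <stdio.h>

    /* Assumed values: 4 KiB pages, 10 index bits per 32-bit paging level. */
    #define PAGE_SHIFT              12
    #define PT32_LEVEL_BITS         10
    #define PT32_LEVEL_SHIFT(level) (PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)
    #define PT32_INDEX(address, level) \
            (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))

    int main(void)
    {
            unsigned long addr = 0xc0101000UL;

            /* Page-directory index (level 2) and page-table index (level 1). */
            printf("pde index = %lu, pte index = %lu\n",
                   (unsigned long)PT32_INDEX(addr, 2),   /* 768 */
                   (unsigned long)PT32_INDEX(addr, 1));  /* 257 */
            return 0;
    }
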
192 int ret = -ENOTSUPP; in kvm_flush_remote_tlbs_with_range()
256 gen = kvm_vcpu_memslots(vcpu)->generation; in check_mmio_spte()
280 return vcpu->arch.efer & EFER_NX; in is_nx()
285 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; in pse36_gfn_delta()
328 sp->clear_spte_count++; in count_spte_clear()
338 ssptep->spte_high = sspte.spte_high; in __set_spte()
347 WRITE_ONCE(ssptep->spte_low, sspte.spte_low); in __set_spte()
357 WRITE_ONCE(ssptep->spte_low, sspte.spte_low); in __update_clear_spte_fast()
365 ssptep->spte_high = sspte.spte_high; in __update_clear_spte_fast()
377 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); in __update_clear_spte_slow()
378 orig.spte_high = ssptep->spte_high; in __update_clear_spte_slow()
379 ssptep->spte_high = sspte.spte_high; in __update_clear_spte_slow()
391 * we need to protect against in-progress updates of the spte.
394 * for the high part of the spte. The race is fine for a present->non-present
395 * change (because the high part of the spte is ignored for non-present spte),
396 * but for a present->present change we must reread the spte.
398 * All such changes are done in two steps (present->non-present and
399 * non-present->present), hence it is enough to count the number of
400 * present->non-present updates: if it changed while reading the spte,
410 count = sp->clear_spte_count; in __get_spte_lockless()
413 spte.spte_low = orig->spte_low; in __get_spte_lockless()
416 spte.spte_high = orig->spte_high; in __get_spte_lockless()
419 if (unlikely(spte.spte_low != orig->spte_low || in __get_spte_lockless()
420 count != sp->clear_spte_count)) in __get_spte_lockless()
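
The block comment and the __get_spte_lockless() fragments above describe a counter-based retry for reading a 64-bit SPTE that is split into two 32-bit halves: snapshot the present->non-present counter, read the low half, then the high half, and retry if either the low half or the counter changed meanwhile. A minimal sketch of that pattern, with hypothetical stand-in types and the real memory barriers omitted:

    #include <stdint.h>

    /* Hypothetical stand-ins for the real structures; smp_rmb() is omitted. */
    struct split_spte { volatile uint32_t spte_low, spte_high; };
    struct shadow_page { volatile int clear_spte_count; };

    static uint64_t get_spte_lockless_sketch(struct shadow_page *sp,
                                             struct split_spte *orig)
    {
            uint32_t low, high;
            int count;

    retry:
            count = sp->clear_spte_count;   /* the real code has smp_rmb() here */
            low = orig->spte_low;           /* low half first */
            high = orig->spte_high;         /* then the high half */
            /* A racing present->non-present update invalidates the read: retry. */
            if (low != orig->spte_low || count != sp->clear_spte_count)
                    goto retry;
            return ((uint64_t)high << 32) | low;
    }
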
434 * out of mmu-lock, it can ensure dirty bit is not lost, in spte_has_volatile_bits()
491 * Whenever we overwrite a writable spte with a read-only one we
493 * will find a read-only spte, even though the writable spte
508 * For the spte updated out of mmu-lock is safe, since in mmu_spte_update()
538 * Returns non-zero if the PTE was previously valid.
586 /* Restore an acc-track PTE back to a regular PTE */
613 clear_bit((ffs(shadow_accessed_mask) - 1), in mmu_spte_age()
633 * Prevent page table teardown by making any free-er wait during in walk_shadow_page_lockless_begin()
640 * to vcpu->mode. in walk_shadow_page_lockless_begin()
642 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); in walk_shadow_page_lockless_begin()
648 * Make sure the write to vcpu->mode is not reordered in front of in walk_shadow_page_lockless_end()
652 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); in walk_shadow_page_lockless_end()
661 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, in mmu_topup_memory_caches()
665 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, in mmu_topup_memory_caches()
670 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, in mmu_topup_memory_caches()
675 return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, in mmu_topup_memory_caches()
681 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); in mmu_free_memory_caches()
682 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); in mmu_free_memory_caches()
683 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); in mmu_free_memory_caches()
684 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); in mmu_free_memory_caches()
689 return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); in mmu_alloc_pte_list_desc()
699 if (!sp->role.direct) in kvm_mmu_page_get_gfn()
700 return sp->gfns[index]; in kvm_mmu_page_get_gfn()
702 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); in kvm_mmu_page_get_gfn()
707 if (!sp->role.direct) { in kvm_mmu_page_set_gfn()
708 sp->gfns[index] = gfn; in kvm_mmu_page_set_gfn()
715 sp->gfn, in kvm_mmu_page_set_gfn()
729 idx = gfn_to_index(gfn, slot->base_gfn, level); in lpage_info_slot()
730 return &slot->arch.lpage_info[level - 2][idx]; in lpage_info_slot()
741 linfo->disallow_lpage += count; in update_gfn_disallow_lpage_count()
742 WARN_ON(linfo->disallow_lpage < 0); in update_gfn_disallow_lpage_count()
753 update_gfn_disallow_lpage_count(slot, gfn, -1); in kvm_mmu_gfn_allow_lpage()
762 kvm->arch.indirect_shadow_pages++; in account_shadowed()
763 gfn = sp->gfn; in account_shadowed()
764 slots = kvm_memslots_for_spte_role(kvm, sp->role); in account_shadowed()
767 /* the non-leaf shadow pages are keeping readonly. */ in account_shadowed()
768 if (sp->role.level > PG_LEVEL_4K) in account_shadowed()
777 if (sp->lpage_disallowed) in account_huge_nx_page()
780 ++kvm->stat.nx_lpage_splits; in account_huge_nx_page()
781 list_add_tail(&sp->lpage_disallowed_link, in account_huge_nx_page()
782 &kvm->arch.lpage_disallowed_mmu_pages); in account_huge_nx_page()
783 sp->lpage_disallowed = true; in account_huge_nx_page()
792 kvm->arch.indirect_shadow_pages--; in unaccount_shadowed()
793 gfn = sp->gfn; in unaccount_shadowed()
794 slots = kvm_memslots_for_spte_role(kvm, sp->role); in unaccount_shadowed()
796 if (sp->role.level > PG_LEVEL_4K) in unaccount_shadowed()
805 --kvm->stat.nx_lpage_splits; in unaccount_huge_nx_page()
806 sp->lpage_disallowed = false; in unaccount_huge_nx_page()
807 list_del(&sp->lpage_disallowed_link); in unaccount_huge_nx_page()
817 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) in gfn_to_memslot_dirty_bitmap()
819 if (no_dirty_log && slot->dirty_bitmap) in gfn_to_memslot_dirty_bitmap()
828 * If the bit zero of rmap_head->val is clear, then it points to the only spte
829 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
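
Put differently, rmap_head->val is a tagged pointer: with bit 0 clear it holds a single spte pointer, with bit 0 set it points (tag masked off) to a pte_list_desc chain. A hedged decode sketch using simplified stand-in structures; it mirrors what rmap_get_first() further down in this listing does:

    #include <stddef.h>

    typedef unsigned long long u64;

    /* Simplified stand-ins for the real pte_list_desc and kvm_rmap_head. */
    #define PTE_LIST_EXT 3
    struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; };
    struct kvm_rmap_head { unsigned long val; };

    static u64 *first_spte(struct kvm_rmap_head *rmap_head)
    {
            if (!rmap_head->val)
                    return NULL;                    /* no spte maps this gfn */
            if (!(rmap_head->val & 1))
                    return (u64 *)rmap_head->val;   /* bit 0 clear: one spte */
            /* bit 0 set: descriptor chain, strip the tag and take the first entry */
            return ((struct pte_list_desc *)(rmap_head->val & ~1ul))->sptes[0];
    }
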
842 if (!rmap_head->val) { in pte_list_add()
843 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); in pte_list_add()
844 rmap_head->val = (unsigned long)spte; in pte_list_add()
845 } else if (!(rmap_head->val & 1)) { in pte_list_add()
846 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); in pte_list_add()
848 desc->sptes[0] = (u64 *)rmap_head->val; in pte_list_add()
849 desc->sptes[1] = spte; in pte_list_add()
850 rmap_head->val = (unsigned long)desc | 1; in pte_list_add()
853 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); in pte_list_add()
854 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); in pte_list_add()
855 while (desc->sptes[PTE_LIST_EXT-1]) { in pte_list_add()
858 if (!desc->more) { in pte_list_add()
859 desc->more = mmu_alloc_pte_list_desc(vcpu); in pte_list_add()
860 desc = desc->more; in pte_list_add()
863 desc = desc->more; in pte_list_add()
865 for (i = 0; desc->sptes[i]; ++i) in pte_list_add()
867 desc->sptes[i] = spte; in pte_list_add()
879 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) in pte_list_desc_remove_entry()
881 desc->sptes[i] = desc->sptes[j]; in pte_list_desc_remove_entry()
882 desc->sptes[j] = NULL; in pte_list_desc_remove_entry()
885 if (!prev_desc && !desc->more) in pte_list_desc_remove_entry()
886 rmap_head->val = 0; in pte_list_desc_remove_entry()
889 prev_desc->more = desc->more; in pte_list_desc_remove_entry()
891 rmap_head->val = (unsigned long)desc->more | 1; in pte_list_desc_remove_entry()
901 if (!rmap_head->val) { in __pte_list_remove()
902 pr_err("%s: %p 0->BUG\n", __func__, spte); in __pte_list_remove()
904 } else if (!(rmap_head->val & 1)) { in __pte_list_remove()
905 rmap_printk("%s: %p 1->0\n", __func__, spte); in __pte_list_remove()
906 if ((u64 *)rmap_head->val != spte) { in __pte_list_remove()
907 pr_err("%s: %p 1->BUG\n", __func__, spte); in __pte_list_remove()
910 rmap_head->val = 0; in __pte_list_remove()
912 rmap_printk("%s: %p many->many\n", __func__, spte); in __pte_list_remove()
913 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); in __pte_list_remove()
916 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { in __pte_list_remove()
917 if (desc->sptes[i] == spte) { in __pte_list_remove()
924 desc = desc->more; in __pte_list_remove()
926 pr_err("%s: %p many->many\n", __func__, spte); in __pte_list_remove()
942 idx = gfn_to_index(gfn, slot->base_gfn, level); in __gfn_to_rmap()
943 return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; in __gfn_to_rmap()
952 slots = kvm_memslots_for_spte_role(kvm, sp->role); in gfn_to_rmap()
954 return __gfn_to_rmap(gfn, sp->role.level, slot); in gfn_to_rmap()
961 mc = &vcpu->arch.mmu_pte_list_desc_cache; in rmap_can_add()
971 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); in rmap_add()
972 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); in rmap_add()
983 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); in rmap_remove()
1010 if (!rmap_head->val) in rmap_get_first()
1013 if (!(rmap_head->val & 1)) { in rmap_get_first()
1014 iter->desc = NULL; in rmap_get_first()
1015 sptep = (u64 *)rmap_head->val; in rmap_get_first()
1019 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); in rmap_get_first()
1020 iter->pos = 0; in rmap_get_first()
1021 sptep = iter->desc->sptes[iter->pos]; in rmap_get_first()
1036 if (iter->desc) { in rmap_get_next()
1037 if (iter->pos < PTE_LIST_EXT - 1) { in rmap_get_next()
1038 ++iter->pos; in rmap_get_next()
1039 sptep = iter->desc->sptes[iter->pos]; in rmap_get_next()
1044 iter->desc = iter->desc->more; in rmap_get_next()
1046 if (iter->desc) { in rmap_get_next()
1047 iter->pos = 0; in rmap_get_next()
1048 /* desc->sptes[0] cannot be NULL */ in rmap_get_next()
1049 sptep = iter->desc->sptes[iter->pos]; in rmap_get_next()
1074 WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); in __drop_large_spte()
1076 --kvm->stat.lpages; in __drop_large_spte()
1085 if (__drop_large_spte(vcpu->kvm, sptep)) { in drop_large_spte()
1088 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, in drop_large_spte()
1089 KVM_PAGES_PER_HPAGE(sp->role.level)); in drop_large_spte()
1094 * Write-protect on the specified @sptep, @pt_protect indicates whether
1095 * spte write-protection is caused by protecting shadow page table.
1099 * - for dirty logging, the spte can be set to writable at anytime if
1101 * - for spte protection, the spte can be writable only after unsync-ing
1160 * - D bit on ad-enabled SPTEs, and
1161 * - W bit on ad-disabled SPTEs.
1209 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1224 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_write_protect_pt_masked()
1226 slot->base_gfn + gfn_offset, mask, true); in kvm_mmu_write_protect_pt_masked()
1228 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), in kvm_mmu_write_protect_pt_masked()
1233 mask &= mask - 1; in kvm_mmu_write_protect_pt_masked()
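
The loop above walks the dirty mask one set bit at a time: __ffs() picks the lowest set bit (the gfn offset to handle) and mask &= mask - 1 clears it. A standalone illustration of that bit walk, with __builtin_ctzl() standing in for the kernel's __ffs():

    #include <stdio.h>

    /* Stand-in for the kernel's __ffs(): index of the lowest set bit. */
    static unsigned long lowest_set_bit(unsigned long x) { return __builtin_ctzl(x); }

    int main(void)
    {
            unsigned long mask = 0x29;      /* page offsets 0, 3 and 5 are dirty */

            while (mask) {
                    printf("handle page offset %lu\n", lowest_set_bit(mask));
                    mask &= mask - 1;       /* clear the lowest set bit */
            }
            return 0;
    }
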
1238 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1239 * protect the page if the D-bit isn't supported.
1241 * @slot: slot to clear D-bit
1243 * @mask: indicates which pages we should clear D-bit
1245 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1253 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_clear_dirty_pt_masked()
1255 slot->base_gfn + gfn_offset, mask, false); in kvm_mmu_clear_dirty_pt_masked()
1257 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), in kvm_mmu_clear_dirty_pt_masked()
1262 mask &= mask - 1; in kvm_mmu_clear_dirty_pt_masked()
1268 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1300 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_gfn_write_protect()
1312 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); in rmap_write_protect()
1399 iterator->level = level; in rmap_walk_init_level()
1400 iterator->gfn = iterator->start_gfn; in rmap_walk_init_level()
1401 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); in rmap_walk_init_level()
1402 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, in rmap_walk_init_level()
1403 iterator->slot); in rmap_walk_init_level()
1411 iterator->slot = slot; in slot_rmap_walk_init()
1412 iterator->start_level = start_level; in slot_rmap_walk_init()
1413 iterator->end_level = end_level; in slot_rmap_walk_init()
1414 iterator->start_gfn = start_gfn; in slot_rmap_walk_init()
1415 iterator->end_gfn = end_gfn; in slot_rmap_walk_init()
1417 rmap_walk_init_level(iterator, iterator->start_level); in slot_rmap_walk_init()
1422 return !!iterator->rmap; in slot_rmap_walk_okay()
1427 if (++iterator->rmap <= iterator->end_rmap) { in slot_rmap_walk_next()
1428 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); in slot_rmap_walk_next()
1432 if (++iterator->level > iterator->end_level) { in slot_rmap_walk_next()
1433 iterator->rmap = NULL; in slot_rmap_walk_next()
1437 rmap_walk_init_level(iterator, iterator->level); in slot_rmap_walk_next()
1470 hva_start = max(start, memslot->userspace_addr); in kvm_handle_hva_range()
1471 hva_end = min(end, memslot->userspace_addr + in kvm_handle_hva_range()
1472 (memslot->npages << PAGE_SHIFT)); in kvm_handle_hva_range()
1477 * {gfn_start, gfn_start+1, ..., gfn_end-1}. in kvm_handle_hva_range()
1480 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); in kvm_handle_hva_range()
1484 gfn_start, gfn_end - 1, in kvm_handle_hva_range()
1512 if (kvm->arch.tdp_mmu_enabled) in kvm_unmap_hva_range()
1524 if (kvm->arch.tdp_mmu_enabled) in kvm_set_spte_hva()
1567 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); in rmap_recycle()
1569 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); in rmap_recycle()
1570 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, in rmap_recycle()
1571 KVM_PAGES_PER_HPAGE(sp->role.level)); in rmap_recycle()
1579 if (kvm->arch.tdp_mmu_enabled) in kvm_age_hva()
1590 if (kvm->arch.tdp_mmu_enabled) in kvm_test_age_hva()
1614 * kvm->arch.n_used_mmu_pages values. We need a global,
1620 kvm->arch.n_used_mmu_pages += nr; in kvm_mod_used_mmu_pages()
1626 MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); in kvm_mmu_free_page()
1627 hlist_del(&sp->hash_link); in kvm_mmu_free_page()
1628 list_del(&sp->link); in kvm_mmu_free_page()
1629 free_page((unsigned long)sp->spt); in kvm_mmu_free_page()
1630 if (!sp->role.direct) in kvm_mmu_free_page()
1631 free_page((unsigned long)sp->gfns); in kvm_mmu_free_page()
1646 pte_list_add(vcpu, parent_pte, &sp->parent_ptes); in mmu_page_add_parent_pte()
1652 __pte_list_remove(parent_pte, &sp->parent_ptes); in mmu_page_remove_parent_pte()
1666 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); in kvm_mmu_alloc_page()
1667 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); in kvm_mmu_alloc_page()
1669 sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); in kvm_mmu_alloc_page()
1670 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); in kvm_mmu_alloc_page()
1677 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; in kvm_mmu_alloc_page()
1678 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); in kvm_mmu_alloc_page()
1679 kvm_mod_used_mmu_pages(vcpu->kvm, +1); in kvm_mmu_alloc_page()
1689 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { in kvm_mmu_mark_parents_unsync()
1700 index = spte - sp->spt; in mark_unsync()
1701 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) in mark_unsync()
1703 if (sp->unsync_children++) in mark_unsync()
1729 if (sp->unsync) in mmu_pages_add()
1730 for (i=0; i < pvec->nr; i++) in mmu_pages_add()
1731 if (pvec->page[i].sp == sp) in mmu_pages_add()
1734 pvec->page[pvec->nr].sp = sp; in mmu_pages_add()
1735 pvec->page[pvec->nr].idx = idx; in mmu_pages_add()
1736 pvec->nr++; in mmu_pages_add()
1737 return (pvec->nr == KVM_PAGE_ARRAY_NR); in mmu_pages_add()
1742 --sp->unsync_children; in clear_unsync_child_bit()
1743 WARN_ON((int)sp->unsync_children < 0); in clear_unsync_child_bit()
1744 __clear_bit(idx, sp->unsync_child_bitmap); in clear_unsync_child_bit()
1752 for_each_set_bit(i, sp->unsync_child_bitmap, 512) { in __mmu_unsync_walk()
1754 u64 ent = sp->spt[i]; in __mmu_unsync_walk()
1763 if (child->unsync_children) { in __mmu_unsync_walk()
1765 return -ENOSPC; in __mmu_unsync_walk()
1775 } else if (child->unsync) { in __mmu_unsync_walk()
1778 return -ENOSPC; in __mmu_unsync_walk()
1786 #define INVALID_INDEX (-1)
1791 pvec->nr = 0; in mmu_unsync_walk()
1792 if (!sp->unsync_children) in mmu_unsync_walk()
1801 WARN_ON(!sp->unsync); in kvm_unlink_unsync_page()
1803 sp->unsync = 0; in kvm_unlink_unsync_page()
1804 --kvm->stat.mmu_unsync; in kvm_unlink_unsync_page()
1819 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
1820 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
1824 return sp->role.cr0_wp && sp->role.smap_andnot_wp; in is_ept_sp()
1827 /* @sp->gfn should be write-protected at the call site */
1831 if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) || in __kvm_sync_page()
1832 vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { in __kvm_sync_page()
1833 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); in __kvm_sync_page()
1858 if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) in kvm_mmu_flush_or_zap()
1874 return sp->role.invalid || in is_obsolete_sp()
1875 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); in is_obsolete_sp()
1881 kvm_unlink_unsync_page(vcpu->kvm, sp); in kvm_sync_page()
1885 /* @gfn should be write-protected at the call site */
1892 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { in kvm_sync_pages()
1893 if (!s->unsync) in kvm_sync_pages()
1896 WARN_ON(s->role.level != PG_LEVEL_4K); in kvm_sync_pages()
1919 for (n = i+1; n < pvec->nr; n++) { in mmu_pages_next()
1920 struct kvm_mmu_page *sp = pvec->page[n].sp; in mmu_pages_next()
1921 unsigned idx = pvec->page[n].idx; in mmu_pages_next()
1922 int level = sp->role.level; in mmu_pages_next()
1924 parents->idx[level-1] = idx; in mmu_pages_next()
1928 parents->parent[level-2] = sp; in mmu_pages_next()
1940 if (pvec->nr == 0) in mmu_pages_first()
1943 WARN_ON(pvec->page[0].idx != INVALID_INDEX); in mmu_pages_first()
1945 sp = pvec->page[0].sp; in mmu_pages_first()
1946 level = sp->role.level; in mmu_pages_first()
1949 parents->parent[level-2] = sp; in mmu_pages_first()
1954 parents->parent[level-1] = NULL; in mmu_pages_first()
1964 unsigned int idx = parents->idx[level]; in mmu_pages_clear_parents()
1965 sp = parents->parent[level]; in mmu_pages_clear_parents()
1972 } while (!sp->unsync_children); in mmu_pages_clear_parents()
1989 protected |= rmap_write_protect(vcpu, sp->gfn); in mmu_sync_children()
1992 kvm_flush_remote_tlbs(vcpu->kvm); in mmu_sync_children()
2000 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { in mmu_sync_children()
2002 cond_resched_lock(&vcpu->kvm->mmu_lock); in mmu_sync_children()
2012 atomic_set(&sp->write_flooding_count, 0); in __clear_sp_write_flooding_count()
2027 bool direct_mmu = vcpu->arch.mmu->direct_map; in kvm_mmu_get_page()
2037 role = vcpu->arch.mmu->mmu_role.base; in kvm_mmu_get_page()
2043 if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { in kvm_mmu_get_page()
2045 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; in kvm_mmu_get_page()
2049 sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; in kvm_mmu_get_page()
2050 for_each_valid_sp(vcpu->kvm, sp, sp_list) { in kvm_mmu_get_page()
2051 if (sp->gfn != gfn) { in kvm_mmu_get_page()
2056 if (!need_sync && sp->unsync) in kvm_mmu_get_page()
2059 if (sp->role.word != role.word) in kvm_mmu_get_page()
2065 if (sp->unsync) { in kvm_mmu_get_page()
2076 if (sp->unsync_children) in kvm_mmu_get_page()
2086 ++vcpu->kvm->stat.mmu_cache_miss; in kvm_mmu_get_page()
2090 sp->gfn = gfn; in kvm_mmu_get_page()
2091 sp->role = role; in kvm_mmu_get_page()
2092 hlist_add_head(&sp->hash_link, sp_list); in kvm_mmu_get_page()
2096 * otherwise the content of the synced shadow page may in kvm_mmu_get_page()
2099 account_shadowed(vcpu->kvm, sp); in kvm_mmu_get_page()
2101 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); in kvm_mmu_get_page()
2110 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) in kvm_mmu_get_page()
2111 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; in kvm_mmu_get_page()
2119 iterator->addr = addr; in shadow_walk_init_using_root()
2120 iterator->shadow_addr = root; in shadow_walk_init_using_root()
2121 iterator->level = vcpu->arch.mmu->shadow_root_level; in shadow_walk_init_using_root()
2123 if (iterator->level == PT64_ROOT_4LEVEL && in shadow_walk_init_using_root()
2124 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && in shadow_walk_init_using_root()
2125 !vcpu->arch.mmu->direct_map) in shadow_walk_init_using_root()
2126 --iterator->level; in shadow_walk_init_using_root()
2128 if (iterator->level == PT32E_ROOT_LEVEL) { in shadow_walk_init_using_root()
2130 * prev_root is currently only used for 64-bit hosts. So only in shadow_walk_init_using_root()
2133 BUG_ON(root != vcpu->arch.mmu->root_hpa); in shadow_walk_init_using_root()
2135 iterator->shadow_addr in shadow_walk_init_using_root()
2136 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; in shadow_walk_init_using_root()
2137 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; in shadow_walk_init_using_root()
2138 --iterator->level; in shadow_walk_init_using_root()
2139 if (!iterator->shadow_addr) in shadow_walk_init_using_root()
2140 iterator->level = 0; in shadow_walk_init_using_root()
2147 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa, in shadow_walk_init()
2153 if (iterator->level < PG_LEVEL_4K) in shadow_walk_okay()
2156 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); in shadow_walk_okay()
2157 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; in shadow_walk_okay()
2164 if (is_last_spte(spte, iterator->level)) { in __shadow_walk_next()
2165 iterator->level = 0; in __shadow_walk_next()
2169 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; in __shadow_walk_next()
2170 --iterator->level; in __shadow_walk_next()
2175 __shadow_walk_next(iterator, *iterator->sptep); in shadow_walk_next()
2185 spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); in link_shadow_page()
2191 if (sp->unsync_children || sp->unsync) in link_shadow_page()
2204 * sp's access: allow writable in the read-only sp, in validate_direct_spte()
2209 if (child->role.access == direct_access) in validate_direct_spte()
2213 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); in validate_direct_spte()
2217 /* Returns the number of zapped non-leaf child shadow pages. */
2226 if (is_last_spte(pte, sp->role.level)) { in mmu_page_zap_pte()
2229 --kvm->stat.lpages; in mmu_page_zap_pte()
2240 child->role.guest_mode && !child->parent_ptes.val) in mmu_page_zap_pte()
2258 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); in kvm_mmu_page_unlink_children()
2268 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) in kvm_mmu_unlink_parents()
2280 if (parent->role.level == PG_LEVEL_4K) in mmu_zap_unsync_children()
2304 ++kvm->stat.mmu_shadow_zapped; in __kvm_mmu_prepare_zap_page()
2312 if (!sp->role.invalid && !sp->role.direct) in __kvm_mmu_prepare_zap_page()
2315 if (sp->unsync) in __kvm_mmu_prepare_zap_page()
2317 if (!sp->root_count) { in __kvm_mmu_prepare_zap_page()
2324 * !sp->root_count. in __kvm_mmu_prepare_zap_page()
2326 if (sp->role.invalid) in __kvm_mmu_prepare_zap_page()
2327 list_add(&sp->link, invalid_list); in __kvm_mmu_prepare_zap_page()
2329 list_move(&sp->link, invalid_list); in __kvm_mmu_prepare_zap_page()
2330 kvm_mod_used_mmu_pages(kvm, -1); in __kvm_mmu_prepare_zap_page()
2336 list_del(&sp->link); in __kvm_mmu_prepare_zap_page()
2347 if (sp->lpage_disallowed) in __kvm_mmu_prepare_zap_page()
2350 sp->role.invalid = 1; in __kvm_mmu_prepare_zap_page()
2373 * the page tables and see changes to vcpu->mode here. The barrier in kvm_mmu_commit_zap_page()
2383 WARN_ON(!sp->role.invalid || sp->root_count); in kvm_mmu_commit_zap_page()
2397 if (list_empty(&kvm->arch.active_mmu_pages)) in kvm_mmu_zap_oldest_mmu_pages()
2401 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { in kvm_mmu_zap_oldest_mmu_pages()
2406 if (sp->root_count) in kvm_mmu_zap_oldest_mmu_pages()
2421 kvm->stat.mmu_recycled += total_zapped; in kvm_mmu_zap_oldest_mmu_pages()
2427 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) in kvm_mmu_available_pages()
2428 return kvm->arch.n_max_mmu_pages - in kvm_mmu_available_pages()
2429 kvm->arch.n_used_mmu_pages; in kvm_mmu_available_pages()
2436 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); in make_mmu_pages_available()
2441 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); in make_mmu_pages_available()
2443 if (!kvm_mmu_available_pages(vcpu->kvm)) in make_mmu_pages_available()
2444 return -ENOSPC; in make_mmu_pages_available()
2454 spin_lock(&kvm->mmu_lock); in kvm_mmu_change_mmu_pages()
2456 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { in kvm_mmu_change_mmu_pages()
2457 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - in kvm_mmu_change_mmu_pages()
2460 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; in kvm_mmu_change_mmu_pages()
2463 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; in kvm_mmu_change_mmu_pages()
2465 spin_unlock(&kvm->mmu_lock); in kvm_mmu_change_mmu_pages()
2476 spin_lock(&kvm->mmu_lock); in kvm_mmu_unprotect_page()
2479 sp->role.word); in kvm_mmu_unprotect_page()
2484 spin_unlock(&kvm->mmu_lock); in kvm_mmu_unprotect_page()
2493 ++vcpu->kvm->stat.mmu_unsync; in kvm_unsync_page()
2494 sp->unsync = 1; in kvm_unsync_page()
2507 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { in mmu_need_write_protect()
2511 if (sp->unsync) in mmu_need_write_protect()
2514 WARN_ON(sp->role.level != PG_LEVEL_4K); in mmu_need_write_protect()
2523 * before the page had been marked as unsync-ed, something like the in mmu_need_write_protect()
2527 * --------------------------------------------------------------------- in mmu_need_write_protect()
2540 * 2.3 kvm_mmu_sync_pages() reads sp->unsync. in mmu_need_write_protect()
2549 * (sp->unsync = true) in mmu_need_write_protect()
2616 drop_spte(vcpu->kvm, sptep); in mmu_set_spte()
2631 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, in mmu_set_spte()
2649 ++vcpu->kvm->stat.lpages; in mmu_set_spte()
2680 unsigned int access = sp->role.access; in direct_pte_prefetch_many()
2684 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); in direct_pte_prefetch_many()
2687 return -1; in direct_pte_prefetch_many()
2689 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); in direct_pte_prefetch_many()
2691 return -1; in direct_pte_prefetch_many()
2694 mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, in direct_pte_prefetch_many()
2708 WARN_ON(!sp->role.direct); in __direct_pte_prefetch()
2710 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); in __direct_pte_prefetch()
2711 spte = sp->spt + i; in __direct_pte_prefetch()
2739 if (sp->role.level > PG_LEVEL_4K) in direct_pte_prefetch()
2756 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() in host_pfn_mapping_level()
2759 * read-only memslots due to gfn_to_hva() assuming writes. Earlier in host_pfn_mapping_level()
2761 * read-only memslot. in host_pfn_mapping_level()
2765 pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); in host_pfn_mapping_level()
2795 for ( ; max_level > PG_LEVEL_4K; max_level--) { in kvm_mmu_hugepage_adjust()
2797 if (!linfo->disallow_lpage) in kvm_mmu_hugepage_adjust()
2821 mask = KVM_PAGES_PER_HPAGE(level) - 1; in kvm_mmu_hugepage_adjust()
2843 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - in disallowed_hugepage_adjust()
2844 KVM_PAGES_PER_HPAGE(level - 1); in disallowed_hugepage_adjust()
2846 (*goal_levelp)--; in disallowed_hugepage_adjust()
2864 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) in __direct_map()
2880 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); in __direct_map()
2887 it.level - 1, true, ACC_ALL); in __direct_map()
2892 account_huge_nx_page(vcpu->kvm, sp); in __direct_map()
2903 ++vcpu->stat.pf_fixed; in __direct_map()
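
The kvm_mmu_hugepage_adjust(), disallowed_hugepage_adjust() and __direct_map() excerpts above all rely on the same alignment arithmetic: KVM_PAGES_PER_HPAGE(level) - 1 is the within-huge-page mask, and gfn & ~mask is the huge page's base gfn. A small worked example, assuming KVM_PAGES_PER_HPAGE(level) expands to 1 << ((level - 1) * 9) on x86:

    #include <stdio.h>

    /* Assumed x86 expansion: 9 index bits per page-table level. */
    #define PAGES_PER_HPAGE(level)  (1UL << (((level) - 1) * 9))

    int main(void)
    {
            unsigned long gfn = 0x12345;

            /* Level 2 = 2 MiB pages = 512 small pages: align gfn down to 512. */
            printf("mask = %lu, base_gfn = 0x%lx\n",
                   PAGES_PER_HPAGE(2) - 1,                  /* 511 */
                   gfn & ~(PAGES_PER_HPAGE(2) - 1));        /* 0x12200 */
            return 0;
    }
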
2927 return -EFAULT; in kvm_handle_bad_page()
2966 * is caused by write-protect, that means we just need change the W in page_fault_can_be_fast()
2967 * bit of the spte which can be done out of mmu-lock. in page_fault_can_be_fast()
2969 * However, if access tracking is disabled we know that a non-present in page_fault_can_be_fast()
2990 WARN_ON(!sp->role.direct); in fast_pf_fix_direct_spte()
3000 * so non-PML cases won't be impacted. in fast_pf_fix_direct_spte()
3010 * calculated by sp->gfn. in fast_pf_fix_direct_spte()
3012 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); in fast_pf_fix_direct_spte()
3056 if (!is_last_spte(spte, sp->role.level)) in fast_page_fault()
3080 * Currently, to simplify the code, write-protection can in fast_page_fault()
3082 * write-protected for dirty-logging or access tracking. in fast_page_fault()
3089 * Do not fix write-permission on the large spte. Since in fast_page_fault()
3090 * we only dirty the first page into the dirty-bitmap in in fast_page_fault()
3099 if (sp->role.level > PG_LEVEL_4K) in fast_page_fault()
3145 if (sp->tdp_mmu_page) in mmu_free_root_page()
3147 else if (sp->role.invalid) in mmu_free_root_page()
3158 struct kvm *kvm = vcpu->kvm; in kvm_mmu_free_roots()
3166 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { in kvm_mmu_free_roots()
3169 VALID_PAGE(mmu->prev_roots[i].hpa)) in kvm_mmu_free_roots()
3176 spin_lock(&kvm->mmu_lock); in kvm_mmu_free_roots()
3180 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, in kvm_mmu_free_roots()
3184 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && in kvm_mmu_free_roots()
3185 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { in kvm_mmu_free_roots()
3186 mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); in kvm_mmu_free_roots()
3187 } else if (mmu->pae_root) { in kvm_mmu_free_roots()
3189 if (mmu->pae_root[i] != 0) in kvm_mmu_free_roots()
3191 &mmu->pae_root[i], in kvm_mmu_free_roots()
3194 mmu->root_hpa = INVALID_PAGE; in kvm_mmu_free_roots()
3195 mmu->root_pgd = 0; in kvm_mmu_free_roots()
3199 spin_unlock(&kvm->mmu_lock); in kvm_mmu_free_roots()
3220 spin_lock(&vcpu->kvm->mmu_lock); in mmu_alloc_root()
3223 spin_unlock(&vcpu->kvm->mmu_lock); in mmu_alloc_root()
3227 ++sp->root_count; in mmu_alloc_root()
3229 spin_unlock(&vcpu->kvm->mmu_lock); in mmu_alloc_root()
3230 return __pa(sp->spt); in mmu_alloc_root()
3235 u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; in mmu_alloc_direct_roots()
3239 if (vcpu->kvm->arch.tdp_mmu_enabled) { in mmu_alloc_direct_roots()
3243 return -ENOSPC; in mmu_alloc_direct_roots()
3244 vcpu->arch.mmu->root_hpa = root; in mmu_alloc_direct_roots()
3250 return -ENOSPC; in mmu_alloc_direct_roots()
3251 vcpu->arch.mmu->root_hpa = root; in mmu_alloc_direct_roots()
3254 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); in mmu_alloc_direct_roots()
3256 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), in mmu_alloc_direct_roots()
3259 return -ENOSPC; in mmu_alloc_direct_roots()
3260 vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; in mmu_alloc_direct_roots()
3262 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); in mmu_alloc_direct_roots()
3267 vcpu->arch.mmu->root_pgd = 0; in mmu_alloc_direct_roots()
3279 root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); in mmu_alloc_shadow_roots()
3287 * write-protect the guests page table root. in mmu_alloc_shadow_roots()
3289 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { in mmu_alloc_shadow_roots()
3290 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); in mmu_alloc_shadow_roots()
3293 vcpu->arch.mmu->shadow_root_level, false); in mmu_alloc_shadow_roots()
3295 return -ENOSPC; in mmu_alloc_shadow_roots()
3296 vcpu->arch.mmu->root_hpa = root; in mmu_alloc_shadow_roots()
3301 * We shadow a 32 bit page table. This may be a legacy 2-level in mmu_alloc_shadow_roots()
3302 * or a PAE 3-level page table. In either case we need to be aware that in mmu_alloc_shadow_roots()
3306 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { in mmu_alloc_shadow_roots()
3310 * Allocate the page for the PDPTEs when shadowing 32-bit NPT in mmu_alloc_shadow_roots()
3311 * with 64-bit only when needed. Unlike 32-bit NPT, it doesn't in mmu_alloc_shadow_roots()
3314 if (!vcpu->arch.mmu->pae_root) { in mmu_alloc_shadow_roots()
3317 vcpu->arch.mmu->pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); in mmu_alloc_shadow_roots()
3318 if (!vcpu->arch.mmu->pae_root) in mmu_alloc_shadow_roots()
3319 return -ENOMEM; in mmu_alloc_shadow_roots()
3324 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); in mmu_alloc_shadow_roots()
3325 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { in mmu_alloc_shadow_roots()
3326 pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); in mmu_alloc_shadow_roots()
3328 vcpu->arch.mmu->pae_root[i] = 0; in mmu_alloc_shadow_roots()
3339 return -ENOSPC; in mmu_alloc_shadow_roots()
3340 vcpu->arch.mmu->pae_root[i] = root | pm_mask; in mmu_alloc_shadow_roots()
3342 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); in mmu_alloc_shadow_roots()
3345 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP in mmu_alloc_shadow_roots()
3348 * on demand, as running a 32-bit L1 VMM is very rare. The PDP is in mmu_alloc_shadow_roots()
3351 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { in mmu_alloc_shadow_roots()
3352 if (vcpu->arch.mmu->lm_root == NULL) { in mmu_alloc_shadow_roots()
3357 return -ENOMEM; in mmu_alloc_shadow_roots()
3359 lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; in mmu_alloc_shadow_roots()
3361 vcpu->arch.mmu->lm_root = lm_root; in mmu_alloc_shadow_roots()
3364 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); in mmu_alloc_shadow_roots()
3368 vcpu->arch.mmu->root_pgd = root_pgd; in mmu_alloc_shadow_roots()
3375 if (vcpu->arch.mmu->direct_map) in mmu_alloc_roots()
3386 if (vcpu->arch.mmu->direct_map) in kvm_mmu_sync_roots()
3389 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) in kvm_mmu_sync_roots()
3394 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { in kvm_mmu_sync_roots()
3395 hpa_t root = vcpu->arch.mmu->root_hpa; in kvm_mmu_sync_roots()
3399 * Even if another CPU was marking the SP as unsync-ed in kvm_mmu_sync_roots()
3408 if (!smp_load_acquire(&sp->unsync) && in kvm_mmu_sync_roots()
3409 !smp_load_acquire(&sp->unsync_children)) in kvm_mmu_sync_roots()
3412 spin_lock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3418 spin_unlock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3422 spin_lock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3426 hpa_t root = vcpu->arch.mmu->pae_root[i]; in kvm_mmu_sync_roots()
3436 spin_unlock(&vcpu->kvm->mmu_lock); in kvm_mmu_sync_roots()
3444 exception->error_code = 0; in nonpaging_gva_to_gpa()
3453 exception->error_code = 0; in nonpaging_gva_to_gpa_nested()
3454 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); in nonpaging_gva_to_gpa_nested()
3462 return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; in __is_rsvd_bits_set()
3467 return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); in __is_bad_mt_xwr()
3487 * That SPTE may be non-present.
3492 int leaf = -1; in get_walk()
3504 sptes[leaf - 1] = spte; in get_walk()
3523 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { in get_mmio_spte()
3528 if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) in get_mmio_spte()
3538 rsvd_check = &vcpu->arch.mmu->shadow_zero_check; in get_mmio_spte()
3540 for (level = root; level >= leaf; level--) { in get_mmio_spte()
3541 if (!is_shadow_present_pte(sptes[level - 1])) in get_mmio_spte()
3544 * Use a bitwise-OR instead of a logical-OR to aggregate the in get_mmio_spte()
3548 reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) || in get_mmio_spte()
3549 __is_rsvd_bits_set(rsvd_check, sptes[level - 1], in get_mmio_spte()
3556 for (level = root; level >= leaf; level--) in get_mmio_spte()
3557 pr_err("------ spte 0x%llx level %d.\n", in get_mmio_spte()
3558 sptes[level - 1], level); in get_mmio_spte()
3561 *sptep = sptes[leaf - 1]; in get_mmio_spte()
3576 return -EINVAL; in handle_mmio_page_fault()
3639 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; in kvm_arch_setup_async_pf()
3641 arch.direct_map = vcpu->arch.mmu->direct_map; in kvm_arch_setup_async_pf()
3642 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); in kvm_arch_setup_async_pf()
3660 if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) in try_async_pf()
3703 if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { in direct_page_fault()
3713 mmu_seq = vcpu->kvm->mmu_notifier_seq; in direct_page_fault()
3723 spin_lock(&vcpu->kvm->mmu_lock); in direct_page_fault()
3724 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) in direct_page_fault()
3730 if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) in direct_page_fault()
3738 spin_unlock(&vcpu->kvm->mmu_lock); in direct_page_fault()
3757 u32 flags = vcpu->arch.apf.host_apf_flags; in kvm_handle_page_fault()
3760 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ in kvm_handle_page_fault()
3762 return -EFAULT; in kvm_handle_page_fault()
3765 vcpu->arch.l1tf_flush_l1d = true; in kvm_handle_page_fault()
3774 vcpu->arch.apf.host_apf_flags = 0; in kvm_handle_page_fault()
3793 max_level--) { in kvm_tdp_page_fault()
3795 gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); in kvm_tdp_page_fault()
3808 context->page_fault = nonpaging_page_fault; in nonpaging_init_context()
3809 context->gva_to_gpa = nonpaging_gva_to_gpa; in nonpaging_init_context()
3810 context->sync_page = nonpaging_sync_page; in nonpaging_init_context()
3811 context->invlpg = NULL; in nonpaging_init_context()
3812 context->root_level = 0; in nonpaging_init_context()
3813 context->shadow_root_level = PT32E_ROOT_LEVEL; in nonpaging_init_context()
3814 context->direct_map = true; in nonpaging_init_context()
3815 context->nx = false; in nonpaging_init_context()
3821 return (role.direct || pgd == root->pgd) && in is_root_usable()
3822 VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && in is_root_usable()
3823 role.word == to_shadow_page(root->hpa)->role.word; in is_root_usable()
3829 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
3831 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
3839 struct kvm_mmu *mmu = vcpu->arch.mmu; in cached_root_available()
3841 root.pgd = mmu->root_pgd; in cached_root_available()
3842 root.hpa = mmu->root_hpa; in cached_root_available()
3848 swap(root, mmu->prev_roots[i]); in cached_root_available()
3854 mmu->root_hpa = root.hpa; in cached_root_available()
3855 mmu->root_pgd = root.pgd; in cached_root_available()
3863 struct kvm_mmu *mmu = vcpu->arch.mmu; in fast_pgd_switch()
3866 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid in fast_pgd_switch()
3867 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs in fast_pgd_switch()
3870 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && in fast_pgd_switch()
3871 mmu->root_level >= PT64_ROOT_4LEVEL) in fast_pgd_switch()
3882 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); in __kvm_mmu_new_pgd()
3901 * switching to a new CR3, that GVA->GPA mapping may no longer be in __kvm_mmu_new_pgd()
3913 to_shadow_page(vcpu->arch.mmu->root_hpa)); in __kvm_mmu_new_pgd()
3950 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. in is_last_gpte()
3954 gpte &= level - mmu->last_nonleaf_level; in is_last_gpte()
3961 gpte |= level - PG_LEVEL_4K - 1; in is_last_gpte()
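
The two is_last_gpte() lines above are an unsigned-underflow trick: subtracting two small level numbers yields a value with bit 7 (the position of the PS bit) set exactly when the left operand is smaller, so the gpte's PS bit is masked off where large pages cannot exist and forced on at PG_LEVEL_4K. A speculative standalone demo of the same arithmetic, assuming the result is ultimately tested against a bit-7 PT_PAGE_SIZE_MASK:

    #include <stdio.h>

    #define PT_PAGE_SIZE_MASK (1u << 7)     /* PS bit of a guest PTE (assumed) */
    #define PG_LEVEL_4K 1

    static int is_last_demo(unsigned level, unsigned last_nonleaf, unsigned gpte)
    {
            gpte &= level - last_nonleaf;    /* bit 7 survives only if level < last_nonleaf */
            gpte |= level - PG_LEVEL_4K - 1; /* forces bit 7 when level == PG_LEVEL_4K */
            return !!(gpte & PT_PAGE_SIZE_MASK);
    }

    int main(void)
    {
            /* A 4K leaf always terminates; a 2M PDE terminates only if PS is set. */
            printf("%d %d %d\n",
                   is_last_demo(1, 3, 0),                   /* 1: 4K PTE */
                   is_last_demo(2, 3, PT_PAGE_SIZE_MASK),   /* 1: PDE with PS=1 */
                   is_last_demo(2, 3, 0));                  /* 0: PDE pointing to a PT */
            return 0;
    }
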
3989 rsvd_check->bad_mt_xwr = 0; in __reset_rsvds_bits_mask()
3997 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for in __reset_rsvds_bits_mask()
4006 rsvd_check->rsvd_bits_mask[0][1] = 0; in __reset_rsvds_bits_mask()
4007 rsvd_check->rsvd_bits_mask[0][0] = 0; in __reset_rsvds_bits_mask()
4008 rsvd_check->rsvd_bits_mask[1][0] = in __reset_rsvds_bits_mask()
4009 rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask()
4012 rsvd_check->rsvd_bits_mask[1][1] = 0; in __reset_rsvds_bits_mask()
4018 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); in __reset_rsvds_bits_mask()
4021 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); in __reset_rsvds_bits_mask()
4024 rsvd_check->rsvd_bits_mask[0][2] = in __reset_rsvds_bits_mask()
4027 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4029 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4031 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4034 rsvd_check->rsvd_bits_mask[1][0] = in __reset_rsvds_bits_mask()
4035 rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask()
4038 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4041 rsvd_check->rsvd_bits_mask[1][4] = in __reset_rsvds_bits_mask()
4042 rsvd_check->rsvd_bits_mask[0][4]; in __reset_rsvds_bits_mask()
4045 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4048 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4051 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4053 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4055 rsvd_check->rsvd_bits_mask[1][3] = in __reset_rsvds_bits_mask()
4056 rsvd_check->rsvd_bits_mask[0][3]; in __reset_rsvds_bits_mask()
4057 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4060 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | in __reset_rsvds_bits_mask()
4063 rsvd_check->rsvd_bits_mask[1][0] = in __reset_rsvds_bits_mask()
4064 rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask()
4072 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, in reset_rsvds_bits_mask()
4073 cpuid_maxphyaddr(vcpu), context->root_level, in reset_rsvds_bits_mask()
4074 context->nx, in reset_rsvds_bits_mask()
4086 rsvd_check->rsvd_bits_mask[0][4] = in __reset_rsvds_bits_mask_ept()
4088 rsvd_check->rsvd_bits_mask[0][3] = in __reset_rsvds_bits_mask_ept()
4090 rsvd_check->rsvd_bits_mask[0][2] = in __reset_rsvds_bits_mask_ept()
4092 rsvd_check->rsvd_bits_mask[0][1] = in __reset_rsvds_bits_mask_ept()
4094 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); in __reset_rsvds_bits_mask_ept()
4097 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; in __reset_rsvds_bits_mask_ept()
4098 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; in __reset_rsvds_bits_mask_ept()
4099 rsvd_check->rsvd_bits_mask[1][2] = in __reset_rsvds_bits_mask_ept()
4101 rsvd_check->rsvd_bits_mask[1][1] = in __reset_rsvds_bits_mask_ept()
4103 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; in __reset_rsvds_bits_mask_ept()
4114 rsvd_check->bad_mt_xwr = bad_mt_xwr; in __reset_rsvds_bits_mask_ept()
4120 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, in reset_rsvds_bits_mask_ept()
4134 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and in reset_shadow_zero_bits_mask()
4135 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. in reset_shadow_zero_bits_mask()
4136 * The iTLB multi-hit workaround can be toggled at any time, so assume in reset_shadow_zero_bits_mask()
4137 * NX can be used by any non-nested shadow MMU to avoid having to reset in reset_shadow_zero_bits_mask()
4140 bool uses_nx = context->nx || !tdp_enabled || in reset_shadow_zero_bits_mask()
4141 context->mmu_role.base.smep_andnot_wp; in reset_shadow_zero_bits_mask()
4149 shadow_zero_check = &context->shadow_zero_check; in reset_shadow_zero_bits_mask()
4152 context->shadow_root_level, uses_nx, in reset_shadow_zero_bits_mask()
4159 for (i = context->shadow_root_level; --i >= 0;) { in reset_shadow_zero_bits_mask()
4160 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; in reset_shadow_zero_bits_mask()
4161 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; in reset_shadow_zero_bits_mask()
4175 * possible, however, kvm currently does not do execution-protection.
4184 shadow_zero_check = &context->shadow_zero_check; in reset_tdp_shadow_zero_bits_mask()
4189 context->shadow_root_level, false, in reset_tdp_shadow_zero_bits_mask()
4200 for (i = context->shadow_root_level; --i >= 0;) { in reset_tdp_shadow_zero_bits_mask()
4201 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; in reset_tdp_shadow_zero_bits_mask()
4202 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; in reset_tdp_shadow_zero_bits_mask()
4214 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, in reset_ept_shadow_zero_bits_mask()
4241 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { in update_permission_bitmask()
4249 /* Faults from writes to non-writable pages */ in update_permission_bitmask()
4253 /* Faults from fetches of non-executable pages*/ in update_permission_bitmask()
4265 if (!mmu->nx) in update_permission_bitmask()
4268 /* Allow supervisor writes if !cr0.wp */ in update_permission_bitmask()
4277 * SMAP:kernel-mode data accesses from user-mode in update_permission_bitmask()
4281 * - X86_CR4_SMAP is set in CR4 in update_permission_bitmask()
4282 * - A user page is accessed in update_permission_bitmask()
4283 * - The access is not a fetch in update_permission_bitmask()
4284 * - Page fault in kernel mode in update_permission_bitmask()
4285 * - if CPL = 3 or X86_EFLAGS_AC is clear in update_permission_bitmask()
4296 mmu->permissions[byte] = ff | uf | wf | smepf | smapf; in update_permission_bitmask()
4302 * user-mode addresses based on the value in the PKRU register. Protection
4311 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4312 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4313 * - PK is always zero if U=0 in the page tables
4314 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4328 bool wp; in update_pkru_bitmask() local
4331 mmu->pkru_mask = 0; in update_pkru_bitmask()
4337 mmu->pkru_mask = 0; in update_pkru_bitmask()
4341 wp = is_write_protection(vcpu); in update_pkru_bitmask()
4343 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { in update_pkru_bitmask()
4362 * user access or CR0.WP = 1. in update_pkru_bitmask()
4364 check_write = check_pkey && wf && (uf || wp); in update_pkru_bitmask()
4371 mmu->pkru_mask |= (pkey_bits & 3) << pfec; in update_pkru_bitmask()
4377 unsigned root_level = mmu->root_level; in update_last_nonleaf_level()
4379 mmu->last_nonleaf_level = root_level; in update_last_nonleaf_level()
4381 mmu->last_nonleaf_level++; in update_last_nonleaf_level()
4388 context->nx = is_nx(vcpu); in paging64_init_context_common()
4389 context->root_level = level; in paging64_init_context_common()
4397 context->page_fault = paging64_page_fault; in paging64_init_context_common()
4398 context->gva_to_gpa = paging64_gva_to_gpa; in paging64_init_context_common()
4399 context->sync_page = paging64_sync_page; in paging64_init_context_common()
4400 context->invlpg = paging64_invlpg; in paging64_init_context_common()
4401 context->shadow_root_level = level; in paging64_init_context_common()
4402 context->direct_map = false; in paging64_init_context_common()
4417 context->nx = false; in paging32_init_context()
4418 context->root_level = PT32_ROOT_LEVEL; in paging32_init_context()
4425 context->page_fault = paging32_page_fault; in paging32_init_context()
4426 context->gva_to_gpa = paging32_gva_to_gpa; in paging32_init_context()
4427 context->sync_page = paging32_sync_page; in paging32_init_context()
4428 context->invlpg = paging32_invlpg; in paging32_init_context()
4429 context->shadow_root_level = PT32E_ROOT_LEVEL; in paging32_init_context()
4430 context->direct_map = false; in paging32_init_context()
4478 /* Use 5-level TDP if and only if it's useful/necessary. */ in kvm_mmu_get_tdp_level()
4500 struct kvm_mmu *context = &vcpu->arch.root_mmu; in init_kvm_tdp_mmu()
4504 if (new_role.as_u64 == context->mmu_role.as_u64) in init_kvm_tdp_mmu()
4507 context->mmu_role.as_u64 = new_role.as_u64; in init_kvm_tdp_mmu()
4508 context->page_fault = kvm_tdp_page_fault; in init_kvm_tdp_mmu()
4509 context->sync_page = nonpaging_sync_page; in init_kvm_tdp_mmu()
4510 context->invlpg = NULL; in init_kvm_tdp_mmu()
4511 context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); in init_kvm_tdp_mmu()
4512 context->direct_map = true; in init_kvm_tdp_mmu()
4513 context->get_guest_pgd = get_cr3; in init_kvm_tdp_mmu()
4514 context->get_pdptr = kvm_pdptr_read; in init_kvm_tdp_mmu()
4515 context->inject_page_fault = kvm_inject_page_fault; in init_kvm_tdp_mmu()
4518 context->nx = false; in init_kvm_tdp_mmu()
4519 context->gva_to_gpa = nonpaging_gva_to_gpa; in init_kvm_tdp_mmu()
4520 context->root_level = 0; in init_kvm_tdp_mmu()
4522 context->nx = is_nx(vcpu); in init_kvm_tdp_mmu()
4523 context->root_level = is_la57_mode(vcpu) ? in init_kvm_tdp_mmu()
4526 context->gva_to_gpa = paging64_gva_to_gpa; in init_kvm_tdp_mmu()
4528 context->nx = is_nx(vcpu); in init_kvm_tdp_mmu()
4529 context->root_level = PT32E_ROOT_LEVEL; in init_kvm_tdp_mmu()
4531 context->gva_to_gpa = paging64_gva_to_gpa; in init_kvm_tdp_mmu()
4533 context->nx = false; in init_kvm_tdp_mmu()
4534 context->root_level = PT32_ROOT_LEVEL; in init_kvm_tdp_mmu()
4536 context->gva_to_gpa = paging32_gva_to_gpa; in init_kvm_tdp_mmu()
4590 context->mmu_role.as_u64 = new_role.as_u64; in shadow_mmu_init_context()
4596 struct kvm_mmu *context = &vcpu->arch.root_mmu; in kvm_init_shadow_mmu()
4600 if (new_role.as_u64 != context->mmu_role.as_u64) in kvm_init_shadow_mmu()
4619 struct kvm_mmu *context = &vcpu->arch.guest_mmu; in kvm_init_shadow_npt_mmu()
4624 if (new_role.as_u64 != context->mmu_role.as_u64) { in kvm_init_shadow_npt_mmu()
4631 context->shadow_root_level = new_role.base.level; in kvm_init_shadow_npt_mmu()
4643 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; in kvm_calc_shadow_ept_root_page_role()
4653 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the in kvm_calc_shadow_ept_root_page_role()
4668 struct kvm_mmu *context = &vcpu->arch.guest_mmu; in kvm_init_shadow_ept_mmu()
4676 if (new_role.as_u64 == context->mmu_role.as_u64) in kvm_init_shadow_ept_mmu()
4679 context->shadow_root_level = level; in kvm_init_shadow_ept_mmu()
4681 context->nx = true; in kvm_init_shadow_ept_mmu()
4682 context->ept_ad = accessed_dirty; in kvm_init_shadow_ept_mmu()
4683 context->page_fault = ept_page_fault; in kvm_init_shadow_ept_mmu()
4684 context->gva_to_gpa = ept_gva_to_gpa; in kvm_init_shadow_ept_mmu()
4685 context->sync_page = ept_sync_page; in kvm_init_shadow_ept_mmu()
4686 context->invlpg = ept_invlpg; in kvm_init_shadow_ept_mmu()
4687 context->root_level = level; in kvm_init_shadow_ept_mmu()
4688 context->direct_map = false; in kvm_init_shadow_ept_mmu()
4689 context->mmu_role.as_u64 = new_role.as_u64; in kvm_init_shadow_ept_mmu()
4701 struct kvm_mmu *context = &vcpu->arch.root_mmu; in init_kvm_softmmu()
4706 vcpu->arch.efer); in init_kvm_softmmu()
4708 context->get_guest_pgd = get_cr3; in init_kvm_softmmu()
4709 context->get_pdptr = kvm_pdptr_read; in init_kvm_softmmu()
4710 context->inject_page_fault = kvm_inject_page_fault; in init_kvm_softmmu()
4718 * Nested MMUs are used only for walking L2's gva->gpa, they never have in kvm_calc_nested_mmu_role()
4740 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; in init_kvm_nested_mmu()
4742 if (new_role.as_u64 == g_context->mmu_role.as_u64) in init_kvm_nested_mmu()
4745 g_context->mmu_role.as_u64 = new_role.as_u64; in init_kvm_nested_mmu()
4746 g_context->get_guest_pgd = get_cr3; in init_kvm_nested_mmu()
4747 g_context->get_pdptr = kvm_pdptr_read; in init_kvm_nested_mmu()
4748 g_context->inject_page_fault = kvm_inject_page_fault; in init_kvm_nested_mmu()
4754 g_context->invlpg = NULL; in init_kvm_nested_mmu()
4757 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using in init_kvm_nested_mmu()
4765 g_context->nx = false; in init_kvm_nested_mmu()
4766 g_context->root_level = 0; in init_kvm_nested_mmu()
4767 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; in init_kvm_nested_mmu()
4769 g_context->nx = is_nx(vcpu); in init_kvm_nested_mmu()
4770 g_context->root_level = is_la57_mode(vcpu) ? in init_kvm_nested_mmu()
4773 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; in init_kvm_nested_mmu()
4775 g_context->nx = is_nx(vcpu); in init_kvm_nested_mmu()
4776 g_context->root_level = PT32E_ROOT_LEVEL; in init_kvm_nested_mmu()
4778 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; in init_kvm_nested_mmu()
4780 g_context->nx = false; in init_kvm_nested_mmu()
4781 g_context->root_level = PT32_ROOT_LEVEL; in init_kvm_nested_mmu()
4783 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; in init_kvm_nested_mmu()
4796 vcpu->arch.mmu->root_hpa = INVALID_PAGE; in kvm_init_mmu()
4799 vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; in kvm_init_mmu()
4835 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); in kvm_mmu_load()
4851 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); in kvm_mmu_unload()
4852 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa)); in kvm_mmu_unload()
4853 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); in kvm_mmu_unload()
4854 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); in kvm_mmu_unload()
4883 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ in mmu_pte_write_fetch_gpte()
4904 * Skip write-flooding detected for the sp whose level is 1, because in detect_write_flooding()
4905 * it can become unsync, then the guest page is not write-protected. in detect_write_flooding()
4907 if (sp->role.level == PG_LEVEL_4K) in detect_write_flooding()
4910 atomic_inc(&sp->write_flooding_count); in detect_write_flooding()
4911 return atomic_read(&sp->write_flooding_count) >= 3; in detect_write_flooding()
4924 gpa, bytes, sp->role.word); in detect_write_misaligned()
4927 pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; in detect_write_misaligned()
4933 if (!(offset & (pte_size - 1)) && bytes == 1) in detect_write_misaligned()
4936 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); in detect_write_misaligned()
4949 level = sp->role.level; in get_written_sptes()
4951 if (!sp->role.gpte_is_8_bytes) { in get_written_sptes()
4952 page_offset <<= 1; /* 32->64 */ in get_written_sptes()
4954 * A 32-bit pde maps 4MB while the shadow pdes map in get_written_sptes()
4965 if (quadrant != sp->role.quadrant) in get_written_sptes()
4969 spte = &sp->spt[page_offset / sizeof(*spte)]; in get_written_sptes()
4986 * write-protected, so we can exit simply. in kvm_mmu_pte_write()
4988 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) in kvm_mmu_pte_write()
5002 spin_lock(&vcpu->kvm->mmu_lock); in kvm_mmu_pte_write()
5006 ++vcpu->kvm->stat.mmu_pte_write; in kvm_mmu_pte_write()
5009 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { in kvm_mmu_pte_write()
5012 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); in kvm_mmu_pte_write()
5013 ++vcpu->kvm->stat.mmu_flooded; in kvm_mmu_pte_write()
5022 while (npte--) { in kvm_mmu_pte_write()
5024 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); in kvm_mmu_pte_write()
5025 if (gentry && sp->role.level != PG_LEVEL_4K) in kvm_mmu_pte_write()
5026 ++vcpu->kvm->stat.mmu_pde_zapped; in kvm_mmu_pte_write()
5034 spin_unlock(&vcpu->kvm->mmu_lock); in kvm_mmu_pte_write()
5042 if (vcpu->arch.mmu->direct_map) in kvm_mmu_unprotect_page_virt()
5047 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); in kvm_mmu_unprotect_page_virt()
5057 bool direct = vcpu->arch.mmu->direct_map; in kvm_mmu_page_fault()
5059 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) in kvm_mmu_page_fault()
5073 return -EIO; in kvm_mmu_page_fault()
5088 if (vcpu->arch.mmu->direct_map && in kvm_mmu_page_fault()
5090 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); in kvm_mmu_page_fault()
5095 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still in kvm_mmu_page_fault()
5097 * re-execute the instruction that caused the page fault. Do not allow in kvm_mmu_page_fault()
5100 * faulting on the non-existent MMIO address. Retrying an instruction in kvm_mmu_page_fault()
5118 /* It's actually a GPA for vcpu->arch.guest_mmu. */ in kvm_mmu_invalidate_gva()
5119 if (mmu != &vcpu->arch.guest_mmu) { in kvm_mmu_invalidate_gva()
5120 /* INVLPG on a non-canonical address is a NOP according to the SDM. */ in kvm_mmu_invalidate_gva()
5127 if (!mmu->invlpg) in kvm_mmu_invalidate_gva()
5131 mmu->invlpg(vcpu, gva, mmu->root_hpa); in kvm_mmu_invalidate_gva()
5145 if (VALID_PAGE(mmu->prev_roots[i].hpa)) in kvm_mmu_invalidate_gva()
5146 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); in kvm_mmu_invalidate_gva()
5148 mmu->invlpg(vcpu, gva, root_hpa); in kvm_mmu_invalidate_gva()
5155 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); in kvm_mmu_invlpg()
5156 ++vcpu->stat.invlpg; in kvm_mmu_invlpg()
5163 struct kvm_mmu *mmu = vcpu->arch.mmu; in kvm_mmu_invpcid_gva()
5168 mmu->invlpg(vcpu, gva, mmu->root_hpa); in kvm_mmu_invpcid_gva()
5173 if (VALID_PAGE(mmu->prev_roots[i].hpa) && in kvm_mmu_invpcid_gva()
5174 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { in kvm_mmu_invpcid_gva()
5175 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); in kvm_mmu_invpcid_gva()
5183 ++vcpu->stat.invlpg; in kvm_mmu_invpcid_gva()
5218 /* The caller should hold mmu-lock before calling this function. */
5232 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { in slot_handle_level_range()
5236 iterator.gfn - start_gfn + 1); in slot_handle_level_range()
5239 cond_resched_lock(&kvm->mmu_lock); in slot_handle_level_range()
5245 end_gfn - start_gfn + 1); in slot_handle_level_range()
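/*
 * Editor's note (illustrative): the flush sizes above are inclusive gfn
 * ranges, hence the "+ 1": flushing [start_gfn, gfn] covers
 * gfn - start_gfn + 1 pages.  Standalone check of that arithmetic:
 */
#include <assert.h>

int main(void)
{
	unsigned long long start_gfn = 0x100, gfn = 0x1ff;

	assert(gfn - start_gfn + 1 == 0x100);	/* 256 pages: 0x100 .. 0x1ff */
	return 0;
}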
5258 end_level, memslot->base_gfn, in slot_handle_level()
5259 memslot->base_gfn + memslot->npages - 1, in slot_handle_level()
5289 free_page((unsigned long)mmu->pae_root); in free_mmu_pages()
5290 free_page((unsigned long)mmu->lm_root); in free_mmu_pages()
5298 mmu->root_hpa = INVALID_PAGE; in __kvm_mmu_create()
5299 mmu->root_pgd = 0; in __kvm_mmu_create()
5300 mmu->translate_gpa = translate_gpa; in __kvm_mmu_create()
5302 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; in __kvm_mmu_create()
5306 * while the PDP table is a per-vCPU construct that's allocated at MMU in __kvm_mmu_create()
5307 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on in __kvm_mmu_create()
5311 * table. The main exception, handled here, is SVM's 32-bit NPT. The in __kvm_mmu_create()
5312 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit in __kvm_mmu_create()
5313 * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). in __kvm_mmu_create()
5320 return -ENOMEM; in __kvm_mmu_create()
5322 mmu->pae_root = page_address(page); in __kvm_mmu_create()
5324 mmu->pae_root[i] = INVALID_PAGE; in __kvm_mmu_create()
5333 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; in kvm_mmu_create()
5334 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; in kvm_mmu_create()
5336 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; in kvm_mmu_create()
5337 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; in kvm_mmu_create()
5339 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; in kvm_mmu_create()
5341 vcpu->arch.mmu = &vcpu->arch.root_mmu; in kvm_mmu_create()
5342 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; in kvm_mmu_create()
5344 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; in kvm_mmu_create()
5346 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); in kvm_mmu_create()
5350 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); in kvm_mmu_create()
5356 free_mmu_pages(&vcpu->arch.guest_mmu); in kvm_mmu_create()
5368 &kvm->arch.active_mmu_pages, link) { in kvm_zap_obsolete_pages()
5381 if (WARN_ON(sp->role.invalid)) in kvm_zap_obsolete_pages()
5391 cond_resched_lock(&kvm->mmu_lock)) { in kvm_zap_obsolete_pages()
5397 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { in kvm_zap_obsolete_pages()
5408 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); in kvm_zap_obsolete_pages()
5412 * Fast-invalidate all shadow pages, using a lock-break technique
5417 * not use any resource of the slot being deleted, or of any slot
5422 lockdep_assert_held(&kvm->slots_lock); in kvm_mmu_zap_all_fast()
5424 spin_lock(&kvm->mmu_lock); in kvm_mmu_zap_all_fast()
5434 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; in kvm_mmu_zap_all_fast()
5448 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_zap_all_fast()
5451 spin_unlock(&kvm->mmu_lock); in kvm_mmu_zap_all_fast()
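/*
 * Editor's illustrative sketch (not part of the kernel source): the
 * single-bit toggle above (mmu_valid_gen = mmu_valid_gen ? 0 : 1) marks
 * every existing shadow page obsolete in O(1); obsolete pages are then
 * zapped with lock breaks instead of all at once.  A minimal standalone
 * model of that obsolescence check, with hypothetical struct and
 * function names:
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_arch { unsigned long mmu_valid_gen; };
struct fake_sp   { unsigned long mmu_valid_gen; };

static bool sp_is_obsolete(const struct fake_arch *arch, const struct fake_sp *sp)
{
	return sp->mmu_valid_gen != arch->mmu_valid_gen;
}

int main(void)
{
	struct fake_arch arch = { .mmu_valid_gen = 0 };
	struct fake_sp sp = { .mmu_valid_gen = arch.mmu_valid_gen };

	arch.mmu_valid_gen = arch.mmu_valid_gen ? 0 : 1;	/* fast invalidate */
	printf("obsolete=%d\n", sp_is_obsolete(&arch, &sp));	/* obsolete=1 */
	return 0;
}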
5456 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); in kvm_has_zapped_obsolete_pages()
5468 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; in kvm_mmu_init_vm()
5472 node->track_write = kvm_mmu_pte_write; in kvm_mmu_init_vm()
5473 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; in kvm_mmu_init_vm()
5479 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; in kvm_mmu_uninit_vm()
5493 spin_lock(&kvm->mmu_lock); in kvm_zap_gfn_range()
5499 start = max(gfn_start, memslot->base_gfn); in kvm_zap_gfn_range()
5500 end = min(gfn_end, memslot->base_gfn + memslot->npages); in kvm_zap_gfn_range()
5507 start, end - 1, true); in kvm_zap_gfn_range()
5511 if (kvm->arch.tdp_mmu_enabled) { in kvm_zap_gfn_range()
5517 spin_unlock(&kvm->mmu_lock); in kvm_zap_gfn_range()
5532 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_remove_write_access()
5535 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_remove_write_access()
5537 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_remove_write_access()
5575 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && in kvm_mmu_zap_collapsible_spte()
5581 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, in kvm_mmu_zap_collapsible_spte()
5582 KVM_PAGES_PER_HPAGE(sp->role.level)); in kvm_mmu_zap_collapsible_spte()
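/*
 * Editor's note (illustrative): the flush above spans
 * KVM_PAGES_PER_HPAGE(sp->role.level) 4 KiB gfns starting at sp->gfn,
 * i.e. the whole region a mapping at that level covers.  On x86 each
 * paging level adds 9 address bits, so the counts are powers of 512:
 */
#include <stdio.h>

int main(void)
{
	unsigned long long pages_2m = (2ULL << 20) / (4 << 10);	/* 512 */
	unsigned long long pages_1g = (1ULL << 30) / (4 << 10);	/* 262144 */

	printf("2M huge page: %llu 4K pages\n", pages_2m);
	printf("1G huge page: %llu 4K pages\n", pages_1g);
	return 0;
}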
5596 /* FIXME: const-ify all uses of struct kvm_memory_slot. */ in kvm_mmu_zap_collapsible_sptes()
5597 spin_lock(&kvm->mmu_lock); in kvm_mmu_zap_collapsible_sptes()
5601 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_zap_collapsible_sptes()
5603 spin_unlock(&kvm->mmu_lock); in kvm_mmu_zap_collapsible_sptes()
5616 lockdep_assert_held(&kvm->slots_lock); in kvm_arch_flush_remote_tlbs_memslot()
5617 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, in kvm_arch_flush_remote_tlbs_memslot()
5618 memslot->npages); in kvm_arch_flush_remote_tlbs_memslot()
5626 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_leaf_clear_dirty()
5628 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_leaf_clear_dirty()
5630 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_leaf_clear_dirty()
5648 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_largepage_remove_write_access()
5651 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_largepage_remove_write_access()
5653 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_largepage_remove_write_access()
5665 spin_lock(&kvm->mmu_lock); in kvm_mmu_slot_set_dirty()
5667 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_slot_set_dirty()
5669 spin_unlock(&kvm->mmu_lock); in kvm_mmu_slot_set_dirty()
5682 spin_lock(&kvm->mmu_lock); in kvm_mmu_zap_all()
5684 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { in kvm_mmu_zap_all()
5685 if (WARN_ON(sp->role.invalid)) in kvm_mmu_zap_all()
5689 if (cond_resched_lock(&kvm->mmu_lock)) in kvm_mmu_zap_all()
5695 if (kvm->arch.tdp_mmu_enabled) in kvm_mmu_zap_all()
5698 spin_unlock(&kvm->mmu_lock); in kvm_mmu_zap_all()
5714 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); in kvm_mmu_invalidate_mmio_sptes()
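/*
 * Editor's note (illustrative): memslot generations advance in steps of
 * the number of address spaces (assumed here to be 2 on x86: normal and
 * SMM), so the low bit effectively encodes the address space.  The mask
 * above strips that modifier before the MMIO generation is checked for
 * wrap.  Standalone demo of the masking:
 */
#include <inttypes.h>
#include <stdio.h>

#define ADDRESS_SPACES 2ULL	/* assumed value of KVM_ADDRESS_SPACE_NUM */

int main(void)
{
	uint64_t gen = 0x2b;				/* arbitrary example value */

	gen &= ~(ADDRESS_SPACES - 1);			/* strip the AS modifier */
	printf("masked gen = 0x%" PRIx64 "\n", gen);	/* 0x2a */
	return 0;
}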
5730 int nr_to_scan = sc->nr_to_scan; in mmu_shrink_scan()
5740 * Never scan more than sc->nr_to_scan VM instances. in mmu_shrink_scan()
5745 if (!nr_to_scan--) in mmu_shrink_scan()
5748 * n_used_mmu_pages is accessed without holding kvm->mmu_lock in mmu_shrink_scan()
5753 if (!kvm->arch.n_used_mmu_pages && in mmu_shrink_scan()
5757 idx = srcu_read_lock(&kvm->srcu); in mmu_shrink_scan()
5758 spin_lock(&kvm->mmu_lock); in mmu_shrink_scan()
5762 &kvm->arch.zapped_obsolete_pages); in mmu_shrink_scan()
5766 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); in mmu_shrink_scan()
5769 spin_unlock(&kvm->mmu_lock); in mmu_shrink_scan()
5770 srcu_read_unlock(&kvm->srcu, idx); in mmu_shrink_scan()
5774 * per-vm shrinkers cry out in mmu_shrink_scan()
5777 list_move_tail(&kvm->vm_list, &vm_list); in mmu_shrink_scan()
5809 * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT in kvm_set_mmio_spte_mask()
5811 * 52-bit physical addresses then there are no reserved PA bits in the in kvm_set_mmio_spte_mask()
5846 return -EINVAL; in set_nx_huge_pages()
5856 mutex_lock(&kvm->slots_lock); in set_nx_huge_pages()
5858 mutex_unlock(&kvm->slots_lock); in set_nx_huge_pages()
5860 wake_up_process(kvm->arch.nx_lpage_recovery_thread); in set_nx_huge_pages()
5870 int ret = -ENOMEM; in kvm_mmu_module_init()
5872 if (nx_huge_pages == -1) in kvm_mmu_module_init()
5930 nr_pages += memslot->npages; in kvm_mmu_calculate_default_mmu_pages()
5942 free_mmu_pages(&vcpu->arch.root_mmu); in kvm_mmu_destroy()
5943 free_mmu_pages(&vcpu->arch.guest_mmu); in kvm_mmu_destroy()
5972 wake_up_process(kvm->arch.nx_lpage_recovery_thread); in set_nx_huge_pages_recovery_ratio()
5989 rcu_idx = srcu_read_lock(&kvm->srcu); in kvm_recover_nx_lpages()
5990 spin_lock(&kvm->mmu_lock); in kvm_recover_nx_lpages()
5993 to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; in kvm_recover_nx_lpages()
5994 for ( ; to_zap; --to_zap) { in kvm_recover_nx_lpages()
5995 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) in kvm_recover_nx_lpages()
6003 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, in kvm_recover_nx_lpages()
6006 WARN_ON_ONCE(!sp->lpage_disallowed); in kvm_recover_nx_lpages()
6007 if (sp->tdp_mmu_page) { in kvm_recover_nx_lpages()
6011 WARN_ON_ONCE(sp->lpage_disallowed); in kvm_recover_nx_lpages()
6014 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { in kvm_recover_nx_lpages()
6016 cond_resched_lock(&kvm->mmu_lock); in kvm_recover_nx_lpages()
6022 spin_unlock(&kvm->mmu_lock); in kvm_recover_nx_lpages()
6023 srcu_read_unlock(&kvm->srcu, rcu_idx); in kvm_recover_nx_lpages()
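/*
 * Editor's note (illustrative): the to_zap computation shown above means
 * each recovery pass zaps roughly 1/ratio of the NX-split huge pages,
 * rounded up, and a ratio of 0 disables zapping entirely.  Standalone
 * check of that DIV_ROUND_UP arithmetic, e.g. 1000 splits at ratio 60:
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned long to_zap(unsigned long splits, unsigned int ratio)
{
	return ratio ? DIV_ROUND_UP(splits, ratio) : 0;
}

int main(void)
{
	printf("%lu\n", to_zap(1000, 60));	/* 17 pages zapped this pass */
	printf("%lu\n", to_zap(1000, 0));	/* 0: recovery disabled */
	return 0;
}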
6029 ? start_time + 60 * HZ - get_jiffies_64() in get_nx_lpage_recovery_timeout()
6063 "kvm-nx-lpage-recovery", in kvm_mmu_post_init_vm()
6064 &kvm->arch.nx_lpage_recovery_thread); in kvm_mmu_post_init_vm()
6066 kthread_unpark(kvm->arch.nx_lpage_recovery_thread); in kvm_mmu_post_init_vm()
6073 if (kvm->arch.nx_lpage_recovery_thread) in kvm_mmu_pre_destroy_vm()
6074 kthread_stop(kvm->arch.nx_lpage_recovery_thread); in kvm_mmu_pre_destroy_vm()