1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 
29 	return true;
30 }
31 
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 							     bool shared)
34 {
35 	if (shared)
36 		lockdep_assert_held_read(&kvm->mmu_lock);
37 	else
38 		lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	if (!kvm->arch.tdp_mmu_enabled)
44 		return;
45 
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.
52 	 */
53 	rcu_barrier();
54 }
55 
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
58 			  bool shared);
59 
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
61 {
62 	free_page((unsigned long)sp->spt);
63 	kmem_cache_free(mmu_page_header_cache, sp);
64 }
65 
66 /*
67  * This is called through call_rcu in order to free TDP page table memory
68  * safely with respect to other kernel threads that may be operating on
69  * the memory.
70  * Because TDP MMU page table memory is only accessed inside RCU read-side
71  * critical sections and is only freed after a grace period, lockless
72  * walkers can never use the memory after it has been freed.
73  */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
75 {
76 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77 					       rcu_head);
78 
79 	tdp_mmu_free_sp(sp);
80 }
81 
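/*
 * Drop a reference to @root. When the last reference is put, the root is
 * unlinked from tdp_mmu_roots, its paging structure is zapped, and the root
 * page itself is freed after an RCU grace period.
 */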
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83 			  bool shared)
84 {
85 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
86 
87 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 		return;
89 
90 	WARN_ON(!root->tdp_mmu_page);
91 
92 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 	list_del_rcu(&root->link);
94 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
95 
96 	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
97 
98 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
99 }
100 
101 /*
102  * Returns the next root after @prev_root (or the first root if @prev_root is
103  * NULL).  A reference to the returned root is acquired, and the reference to
104  * @prev_root is released (the caller obviously must hold a reference to
105  * @prev_root if it's non-NULL).
106  *
107  * If @only_valid is true, invalid roots are skipped.
108  *
109  * Returns NULL if the end of tdp_mmu_roots was reached.
110  */
111 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
112 					      struct kvm_mmu_page *prev_root,
113 					      bool shared, bool only_valid)
114 {
115 	struct kvm_mmu_page *next_root;
116 
117 	rcu_read_lock();
118 
119 	if (prev_root)
120 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
121 						  &prev_root->link,
122 						  typeof(*prev_root), link);
123 	else
124 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
125 						   typeof(*next_root), link);
126 
127 	while (next_root) {
128 		if ((!only_valid || !next_root->role.invalid) &&
129 		    kvm_tdp_mmu_get_root(kvm, next_root))
130 			break;
131 
132 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
133 				&next_root->link, typeof(*next_root), link);
134 	}
135 
136 	rcu_read_unlock();
137 
138 	if (prev_root)
139 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
140 
141 	return next_root;
142 }
143 
144 /*
145  * Note: this iterator gets and puts references to the roots it iterates over.
146  * This makes it safe to release the MMU lock and yield within the loop, but
147  * if exiting the loop early, the caller must drop the reference to the most
148  * recent root. (Unless keeping a live reference is desirable.)
149  *
150  * If shared is set, this function is operating under the MMU lock in read
151  * mode. In the unlikely event that this thread must free a root, the lock
152  * will be temporarily dropped and reacquired in write mode.
153  */
154 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
155 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
156 	     _root;								\
157 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
158 		if (kvm_mmu_page_as_id(_root) != _as_id) {			\
159 		} else
160 
161 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
162 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
163 
164 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)		\
165 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
166 
167 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
168 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
169 				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
170 				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
171 		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
172 		} else
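
/*
 * Illustrative caller pattern for the yield-safe root iterators above (a
 * sketch modeled on __kvm_tdp_mmu_zap_gfn_range() later in this file, not a
 * new API): the iterator takes and drops root references itself, so the loop
 * body only has to do the per-root work.
 *
 *	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
 *		flush = zap_gfn_range(kvm, root, start, end, true, flush,
 *				      false);
 */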
173 
174 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
175 						   int level)
176 {
177 	union kvm_mmu_page_role role;
178 
179 	role = vcpu->arch.mmu->mmu_role.base;
180 	role.level = level;
181 	role.direct = true;
182 	role.gpte_is_8_bytes = true;
183 	role.access = ACC_ALL;
184 
185 	return role;
186 }
187 
188 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
189 					       int level)
190 {
191 	struct kvm_mmu_page *sp;
192 
193 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
194 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
195 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
196 
197 	sp->role.word = page_role_for_level(vcpu, level).word;
198 	sp->gfn = gfn;
199 	sp->tdp_mmu_page = true;
200 
201 	trace_kvm_mmu_get_page(sp, true);
202 
203 	return sp;
204 }
205 
206 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
207 {
208 	union kvm_mmu_page_role role;
209 	struct kvm *kvm = vcpu->kvm;
210 	struct kvm_mmu_page *root;
211 
212 	lockdep_assert_held_write(&kvm->mmu_lock);
213 
214 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
215 
216 	/*
217 	 * Check for an existing root before allocating a new one.  Note, the
218 	 * role check prevents consuming an invalid root.
219 	 */
220 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
221 		if (root->role.word == role.word &&
222 		    kvm_tdp_mmu_get_root(kvm, root))
223 			goto out;
224 	}
225 
226 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
227 	refcount_set(&root->tdp_mmu_root_count, 1);
228 
229 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
230 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
231 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
232 
233 out:
234 	return __pa(root->spt);
235 }
236 
237 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
238 				u64 old_spte, u64 new_spte, int level,
239 				bool shared);
240 
241 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
242 {
243 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
244 		return;
245 
246 	if (is_accessed_spte(old_spte) &&
247 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
248 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
249 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
250 }
251 
252 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
253 					  u64 old_spte, u64 new_spte, int level)
254 {
255 	bool pfn_changed;
256 	struct kvm_memory_slot *slot;
257 
258 	if (level > PG_LEVEL_4K)
259 		return;
260 
261 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
262 
263 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
264 	    is_writable_pte(new_spte)) {
265 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
266 		mark_page_dirty_in_slot(kvm, slot, gfn);
267 	}
268 }
269 
270 /**
271  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
272  *
273  * @kvm: kvm instance
274  * @sp: the new page
275  * @account_nx: This page replaces a NX large page and should be marked for
276  *		eventual reclaim.
277  */
278 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
279 			      bool account_nx)
280 {
281 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
282 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
283 	if (account_nx)
284 		account_huge_nx_page(kvm, sp);
285 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
286 }
287 
288 /**
289  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
290  *
291  * @kvm: kvm instance
292  * @sp: the page to be removed
293  * @shared: This operation may not be running under the exclusive use of
294  *	    the MMU lock and the operation must synchronize with other
295  *	    threads that might be adding or removing pages.
296  */
297 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
298 				bool shared)
299 {
300 	if (shared)
301 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
302 	else
303 		lockdep_assert_held_write(&kvm->mmu_lock);
304 
305 	list_del(&sp->link);
306 	if (sp->lpage_disallowed)
307 		unaccount_huge_nx_page(kvm, sp);
308 
309 	if (shared)
310 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
311 }
312 
313 /**
314  * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
315  *
316  * @kvm: kvm instance
317  * @pt: the page removed from the paging structure
318  * @shared: This operation may not be running under the exclusive use
319  *	    of the MMU lock and the operation must synchronize with other
320  *	    threads that might be modifying SPTEs.
321  *
322  * Given a page table that has been removed from the TDP paging structure,
323  * iterates through the page table to clear SPTEs and free child page tables.
324  *
325  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
326  * protection. Since this thread removed it from the paging structure,
327  * this thread will be responsible for ensuring the page is freed. Hence the
328  * early rcu_dereferences in the function.
329  */
330 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
331 					bool shared)
332 {
333 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
334 	int level = sp->role.level;
335 	gfn_t base_gfn = sp->gfn;
336 	int i;
337 
338 	trace_kvm_mmu_prepare_zap_page(sp);
339 
340 	tdp_mmu_unlink_page(kvm, sp, shared);
341 
342 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
343 		u64 *sptep = rcu_dereference(pt) + i;
344 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
345 		u64 old_child_spte;
346 
347 		if (shared) {
348 			/*
349 			 * Set the SPTE to a nonpresent value that other
350 			 * threads will not overwrite. If the SPTE was
351 			 * already marked as removed then another thread
352 			 * handling a page fault could overwrite it, so keep
353 			 * writing the removed value until the previous value
354 			 * read back is something other than the removed value.
355 			 */
356 			for (;;) {
357 				old_child_spte = xchg(sptep, REMOVED_SPTE);
358 				if (!is_removed_spte(old_child_spte))
359 					break;
360 				cpu_relax();
361 			}
362 		} else {
363 			/*
364 			 * If the SPTE is not MMU-present, there is no backing
365 			 * page associated with the SPTE and so no side effects
366 			 * that need to be recorded, and exclusive ownership of
367 			 * mmu_lock ensures the SPTE can't be made present.
368 			 * Note, zapping MMIO SPTEs is also unnecessary as they
369 			 * are guarded by the memslots generation, not by being
370 			 * unreachable.
371 			 */
372 			old_child_spte = READ_ONCE(*sptep);
373 			if (!is_shadow_present_pte(old_child_spte))
374 				continue;
375 
376 			/*
377 			 * Marking the SPTE as a removed SPTE is not
378 			 * strictly necessary here as the MMU lock will
379 			 * stop other threads from concurrently modifying
380 			 * this SPTE. Using the removed SPTE value keeps
381 			 * the two branches consistent and simplifies
382 			 * the function.
383 			 */
384 			WRITE_ONCE(*sptep, REMOVED_SPTE);
385 		}
386 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
387 				    old_child_spte, REMOVED_SPTE, level,
388 				    shared);
389 	}
390 
391 	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
392 					   KVM_PAGES_PER_HPAGE(level + 1));
393 
394 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
395 }
396 
397 /**
398  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
399  * @kvm: kvm instance
400  * @as_id: the address space of the paging structure the SPTE was a part of
401  * @gfn: the base GFN that was mapped by the SPTE
402  * @old_spte: The value of the SPTE before the change
403  * @new_spte: The value of the SPTE after the change
404  * @level: the level of the PT the SPTE is part of in the paging structure
405  * @shared: This operation may not be running under the exclusive use of
406  *	    the MMU lock and the operation must synchronize with other
407  *	    threads that might be modifying SPTEs.
408  *
409  * Handle bookkeeping that might result from the modification of a SPTE.
410  * This function must be called for all TDP SPTE modifications.
411  */
412 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
413 				  u64 old_spte, u64 new_spte, int level,
414 				  bool shared)
415 {
416 	bool was_present = is_shadow_present_pte(old_spte);
417 	bool is_present = is_shadow_present_pte(new_spte);
418 	bool was_leaf = was_present && is_last_spte(old_spte, level);
419 	bool is_leaf = is_present && is_last_spte(new_spte, level);
420 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
421 
422 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
423 	WARN_ON(level < PG_LEVEL_4K);
424 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
425 
426 	/*
427 	 * If this warning were to trigger it would indicate that there was a
428 	 * missing MMU notifier or a race with some notifier handler.
429 	 * A present, leaf SPTE should never be directly replaced with another
430 	 * present leaf SPTE pointing to a different PFN. A notifier handler
431 	 * should be zapping the SPTE before the main MM's page table is
432 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
433 	 * thread before replacement.
434 	 */
435 	if (was_leaf && is_leaf && pfn_changed) {
436 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
437 		       "SPTE with another present leaf SPTE mapping a\n"
438 		       "different PFN!\n"
439 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
440 		       as_id, gfn, old_spte, new_spte, level);
441 
442 		/*
443 		 * Crash the host to prevent error propagation and guest data
444 		 * corruption.
445 		 */
446 		BUG();
447 	}
448 
449 	if (old_spte == new_spte)
450 		return;
451 
452 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
453 
454 	/*
455 	 * The only times a SPTE should be changed from a non-present to
456 	 * non-present state is when an MMIO entry is installed/modified/
457 	 * removed. In that case, there is nothing to do here.
458 	 */
459 	if (!was_present && !is_present) {
460 		/*
461 		 * If this change does not involve a MMIO SPTE or removed SPTE,
462 		 * it is unexpected. Log the change, though it should not
463 		 * impact the guest since both the former and current SPTEs
464 		 * are nonpresent.
465 		 */
466 		if (WARN_ON(!is_mmio_spte(old_spte) &&
467 			    !is_mmio_spte(new_spte) &&
468 			    !is_removed_spte(new_spte)))
469 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
470 			       "should not be replaced with another,\n"
471 			       "different nonpresent SPTE, unless one or both\n"
472 			       "are MMIO SPTEs, or the new SPTE is\n"
473 			       "a temporary removed SPTE.\n"
474 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
475 			       as_id, gfn, old_spte, new_spte, level);
476 		return;
477 	}
478 
479 	if (is_leaf != was_leaf)
480 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
481 
482 	if (was_leaf && is_dirty_spte(old_spte) &&
483 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
484 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
485 
486 	/*
487 	 * Recursively handle child PTs if the change removed a subtree from
488 	 * the paging structure.
489 	 */
490 	if (was_present && !was_leaf && (pfn_changed || !is_present))
491 		handle_removed_tdp_mmu_page(kvm,
492 				spte_to_child_pt(old_spte, level), shared);
493 }
494 
495 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
496 				u64 old_spte, u64 new_spte, int level,
497 				bool shared)
498 {
499 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
500 			      shared);
501 	handle_changed_spte_acc_track(old_spte, new_spte, level);
502 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
503 				      new_spte, level);
504 }
505 
506 /*
507  * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
508  * and handle the associated bookkeeping, but do not mark the page dirty
509  * in KVM's dirty bitmaps.
510  *
511  * @kvm: kvm instance
512  * @iter: a tdp_iter instance currently on the SPTE that should be set
513  * @new_spte: The value the SPTE should be set to
514  * Returns: true if the SPTE was set, false if it was not. If false is returned,
515  *	    this function will have no side-effects.
516  */
517 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
518 							struct tdp_iter *iter,
519 							u64 new_spte)
520 {
521 	WARN_ON_ONCE(iter->yielded);
522 
523 	lockdep_assert_held_read(&kvm->mmu_lock);
524 
525 	/*
526 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
527 	 * may modify it.
528 	 */
529 	if (is_removed_spte(iter->old_spte))
530 		return false;
531 
532 	/*
533 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
534 	 * does not hold the mmu_lock.
535 	 */
536 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
537 		      new_spte) != iter->old_spte)
538 		return false;
539 
540 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
541 			      new_spte, iter->level, true);
542 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
543 
544 	return true;
545 }
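
/*
 * Typical retry pattern when the atomic helper above loses the cmpxchg race
 * (a sketch modeled on wrprot_gfn_range() and clear_dirty_gfn_range() below):
 * the iterator must explicitly re-read the SPTE before retrying.
 *
 * retry:
 *	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, new_spte)) {
 *		iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
 *		goto retry;
 *	}
 */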
546 
547 /*
548  * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
549  * TDP page fault.
550  *
551  * @vcpu: The vcpu instance that took the TDP page fault.
552  * @iter: a tdp_iter instance currently on the SPTE that should be set
553  * @new_spte: The value the SPTE should be set to
554  *
555  * Returns: true if the SPTE was set, false if it was not. If false is returned,
556  *	    this function will have no side-effects.
557  */
558 static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
559 					       struct tdp_iter *iter,
560 					       u64 new_spte)
561 {
562 	struct kvm *kvm = vcpu->kvm;
563 
564 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
565 		return false;
566 
567 	/*
568 	 * Use kvm_vcpu_gfn_to_memslot() instead of going through
569 	 * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
570 	 */
571 	if (is_writable_pte(new_spte)) {
572 		struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
573 
574 		if (slot && kvm_slot_dirty_track_enabled(slot)) {
575 			/* Enforced by kvm_mmu_hugepage_adjust. */
576 			WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
577 			mark_page_dirty_in_slot(kvm, slot, iter->gfn);
578 		}
579 	}
580 
581 	return true;
582 }
583 
584 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
585 					   struct tdp_iter *iter)
586 {
587 	/*
588 	 * Freeze the SPTE by setting it to a special,
589 	 * non-present value. This will stop other threads from
590 	 * immediately installing a present entry in its place
591 	 * before the TLBs are flushed.
592 	 */
593 	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
594 		return false;
595 
596 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
597 					   KVM_PAGES_PER_HPAGE(iter->level));
598 
599 	/*
600 	 * No other thread can overwrite the removed SPTE as they
601 	 * must either wait on the MMU lock or use
602 	 * tdp_mmu_set_spte_atomic which will not overwrite the
603 	 * special removed SPTE value. No bookkeeping is needed
604 	 * here since the SPTE is going from non-present
605 	 * to non-present.
606 	 */
607 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
608 
609 	return true;
610 }
611 
612 
613 /*
614  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
615  * @kvm: kvm instance
616  * @iter: a tdp_iter instance currently on the SPTE that should be set
617  * @new_spte: The value the SPTE should be set to
618  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
619  *		      of the page. Should be set unless handling an MMU
620  *		      notifier for access tracking. Leaving record_acc_track
621  *		      unset in that case prevents page accesses from being
622  *		      double counted.
623  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
624  *		      appropriate for the change being made. Should be set
625  *		      unless performing certain dirty logging operations.
626  *		      Leaving record_dirty_log unset in that case prevents page
627  *		      writes from being double counted.
628  */
629 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
630 				      u64 new_spte, bool record_acc_track,
631 				      bool record_dirty_log)
632 {
633 	WARN_ON_ONCE(iter->yielded);
634 
635 	lockdep_assert_held_write(&kvm->mmu_lock);
636 
637 	/*
638 	 * No thread should be using this function to set SPTEs to the
639 	 * temporary removed SPTE value.
640 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
641 	 * should be used. If operating under the MMU lock in write mode, the
642 	 * use of the removed SPTE should not be necessary.
643 	 */
644 	WARN_ON(is_removed_spte(iter->old_spte));
645 
646 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
647 
648 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
649 			      new_spte, iter->level, false);
650 	if (record_acc_track)
651 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
652 					      iter->level);
653 	if (record_dirty_log)
654 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
655 					      iter->old_spte, new_spte,
656 					      iter->level);
657 }
658 
659 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
660 				    u64 new_spte)
661 {
662 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
663 }
664 
665 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
666 						 struct tdp_iter *iter,
667 						 u64 new_spte)
668 {
669 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
670 }
671 
672 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
673 						 struct tdp_iter *iter,
674 						 u64 new_spte)
675 {
676 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
677 }
678 
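/*
 * Iterators over the SPTEs of a paging structure: tdp_root_for_each_pte()
 * visits every SPTE of @_root covering [_start, _end), the leaf variant only
 * visits SPTEs that are both present and last-level, and
 * tdp_mmu_for_each_pte() walks the paging structure rooted at the vCPU's
 * current root (_mmu->root_hpa).
 */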
679 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
680 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
681 
682 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
683 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
684 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
685 		    !is_last_spte(_iter.old_spte, _iter.level))		\
686 			continue;					\
687 		else
688 
689 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
690 	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
691 			 _mmu->shadow_root_level, _start, _end)
692 
693 /*
694  * Yield if the MMU lock is contended or this thread needs to return control
695  * to the scheduler.
696  *
697  * If this function should yield and flush is set, it will perform a remote
698  * TLB flush before yielding.
699  *
700  * If this function yields, iter->yielded is set and the caller must skip to
701  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
702  * over the paging structures to allow the iterator to continue its traversal
703  * from the paging structure root.
704  *
705  * Returns true if this function yielded.
706  */
707 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
708 							  struct tdp_iter *iter,
709 							  bool flush, bool shared)
710 {
711 	WARN_ON(iter->yielded);
712 
713 	/* Ensure forward progress has been made before yielding. */
714 	if (iter->next_last_level_gfn == iter->yielded_gfn)
715 		return false;
716 
717 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
718 		rcu_read_unlock();
719 
720 		if (flush)
721 			kvm_flush_remote_tlbs(kvm);
722 
723 		if (shared)
724 			cond_resched_rwlock_read(&kvm->mmu_lock);
725 		else
726 			cond_resched_rwlock_write(&kvm->mmu_lock);
727 
728 		rcu_read_lock();
729 
730 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
731 
732 		iter->yielded = true;
733 	}
734 
735 	return iter->yielded;
736 }
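
/*
 * Sketch of the caller pattern described above (modeled on zap_gfn_range()
 * below): the yield check sits at the top of the loop body, and a yield
 * simply continues to the next iteration so tdp_iter_next() can restart the
 * walk from the paging structure root.
 *
 *	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
 *				   min_level, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared))
 *			continue;
 *		...
 *	}
 */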
737 
738 /*
739  * Tears down the mappings for the range of gfns, [start, end), and frees the
740  * non-root pages mapping GFNs strictly within that range. Returns true if
741  * SPTEs have been cleared and a TLB flush is needed before releasing the
742  * MMU lock.
743  *
744  * If can_yield is true, will release the MMU lock and reschedule if the
745  * scheduler needs the CPU or there is contention on the MMU lock. If this
746  * function cannot yield, it will not release the MMU lock or reschedule and
747  * the caller must ensure it does not supply too large a GFN range, or the
748  * operation can cause a soft lockup.
749  *
750  * If shared is true, this thread holds the MMU lock in read mode and must
751  * account for the possibility that other threads are modifying the paging
752  * structures concurrently. If shared is false, this thread should hold the
753  * MMU lock in write mode.
754  */
755 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
756 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
757 			  bool shared)
758 {
759 	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
760 	bool zap_all = (start == 0 && end >= max_gfn_host);
761 	struct tdp_iter iter;
762 
763 	/*
764 	 * No need to try to step down in the iterator when zapping all SPTEs,
765 	 * zapping the top-level non-leaf SPTEs will recurse on their children.
766 	 */
767 	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
768 
769 	/*
770 	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
771 	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
772 	 * and so KVM will never install a SPTE for such addresses.
773 	 */
774 	end = min(end, max_gfn_host);
775 
776 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
777 
778 	rcu_read_lock();
779 
780 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
781 				   min_level, start, end) {
782 retry:
783 		if (can_yield &&
784 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
785 			flush = false;
786 			continue;
787 		}
788 
789 		if (!is_shadow_present_pte(iter.old_spte))
790 			continue;
791 
792 		/*
793 		 * If this is a non-last-level SPTE that covers a larger range
794 		 * than should be zapped, continue, and zap the mappings at a
795 		 * lower level, except when zapping all SPTEs.
796 		 */
797 		if (!zap_all &&
798 		    (iter.gfn < start ||
799 		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
800 		    !is_last_spte(iter.old_spte, iter.level))
801 			continue;
802 
803 		if (!shared) {
804 			tdp_mmu_set_spte(kvm, &iter, 0);
805 			flush = true;
806 		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
807 			/*
808 			 * The iter must explicitly re-read the SPTE because
809 			 * the atomic cmpxchg failed.
810 			 */
811 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
812 			goto retry;
813 		}
814 	}
815 
816 	rcu_read_unlock();
817 	return flush;
818 }
819 
820 /*
821  * Tears down the mappings for the range of gfns, [start, end), and frees the
822  * non-root pages mapping GFNs strictly within that range. Returns true if
823  * SPTEs have been cleared and a TLB flush is needed before releasing the
824  * MMU lock.
825  */
826 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
827 				 gfn_t end, bool can_yield, bool flush)
828 {
829 	struct kvm_mmu_page *root;
830 
831 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
832 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
833 				      false);
834 
835 	return flush;
836 }
837 
838 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
839 {
840 	bool flush = false;
841 	int i;
842 
843 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
844 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
845 
846 	if (flush)
847 		kvm_flush_remote_tlbs(kvm);
848 }
849 
850 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
851 						  struct kvm_mmu_page *prev_root)
852 {
853 	struct kvm_mmu_page *next_root;
854 
855 	if (prev_root)
856 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
857 						  &prev_root->link,
858 						  typeof(*prev_root), link);
859 	else
860 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
861 						   typeof(*next_root), link);
862 
863 	while (next_root && !(next_root->role.invalid &&
864 			      refcount_read(&next_root->tdp_mmu_root_count)))
865 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
866 						  &next_root->link,
867 						  typeof(*next_root), link);
868 
869 	return next_root;
870 }
871 
872 /*
873  * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
874  * invalidated root, they will not be freed until this function drops the
875  * reference. Before dropping that reference, tear down the paging
876  * structure so that whichever thread does drop the last reference
877  * only has to do a trivial amount of work. Since the roots are invalid,
878  * no new SPTEs should be created under them.
879  */
880 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
881 {
882 	struct kvm_mmu_page *next_root;
883 	struct kvm_mmu_page *root;
884 	bool flush = false;
885 
886 	lockdep_assert_held_read(&kvm->mmu_lock);
887 
888 	rcu_read_lock();
889 
890 	root = next_invalidated_root(kvm, NULL);
891 
892 	while (root) {
893 		next_root = next_invalidated_root(kvm, root);
894 
895 		rcu_read_unlock();
896 
897 		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
898 
899 		/*
900 		 * Put the reference acquired in
901 		 * kvm_tdp_mmu_invalidate_roots
902 		 */
903 		kvm_tdp_mmu_put_root(kvm, root, true);
904 
905 		root = next_root;
906 
907 		rcu_read_lock();
908 	}
909 
910 	rcu_read_unlock();
911 
912 	if (flush)
913 		kvm_flush_remote_tlbs(kvm);
914 }
915 
916 /*
917  * Mark each TDP MMU root as invalid so that other threads
918  * will drop their references and allow the root count to
919  * go to 0.
920  *
921  * Also take a reference on all roots so that this thread
922  * can do the bulk of the work required to free the roots
923  * once they are invalidated. Without this reference, a
924  * vCPU thread might drop the last reference to a root and
925  * get stuck with tearing down the entire paging structure.
926  *
927  * Roots which have a zero refcount should be skipped as
928  * they're already being torn down.
929  * Already invalid roots should be referenced again so that
930  * they aren't freed before kvm_tdp_mmu_zap_all_fast is
931  * done with them.
932  *
933  * This has essentially the same effect for the TDP MMU
934  * as updating mmu_valid_gen does for the shadow MMU.
935  */
936 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
937 {
938 	struct kvm_mmu_page *root;
939 
940 	lockdep_assert_held_write(&kvm->mmu_lock);
941 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
942 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
943 			root->role.invalid = true;
944 }
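
/*
 * Illustrative sequence combining the two functions above, consistent with
 * their lockdep assertions (a sketch of how a fast-zap caller is expected to
 * use them, per the comments above; not a new API):
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_invalidate_all_roots(kvm);
 *	write_unlock(&kvm->mmu_lock);
 *
 *	read_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm);
 *	read_unlock(&kvm->mmu_lock);
 */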
945 
946 /*
947  * Installs a last-level SPTE to handle a TDP page fault.
948  * (NPT/EPT violation/misconfiguration)
949  */
950 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
951 					  int map_writable,
952 					  struct tdp_iter *iter,
953 					  kvm_pfn_t pfn, bool prefault)
954 {
955 	u64 new_spte;
956 	int ret = RET_PF_FIXED;
957 	int make_spte_ret = 0;
958 
959 	if (unlikely(is_noslot_pfn(pfn)))
960 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
961 	else
962 		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
963 					 pfn, iter->old_spte, prefault, true,
964 					 map_writable, !shadow_accessed_mask,
965 					 &new_spte);
966 
967 	if (new_spte == iter->old_spte)
968 		ret = RET_PF_SPURIOUS;
969 	else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
970 		return RET_PF_RETRY;
971 
972 	/*
973 	 * If the page fault was caused by a write but the page is write
974 	 * protected, emulation is needed. If the emulation was skipped,
975 	 * the vCPU would have the same fault again.
976 	 */
977 	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
978 		if (write)
979 			ret = RET_PF_EMULATE;
980 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
981 	}
982 
983 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
984 	if (unlikely(is_mmio_spte(new_spte))) {
985 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
986 				     new_spte);
987 		ret = RET_PF_EMULATE;
988 	} else {
989 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
990 				       rcu_dereference(iter->sptep));
991 	}
992 
993 	/*
994 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
995 	 * consistent with legacy MMU behavior.
996 	 */
997 	if (ret != RET_PF_SPURIOUS)
998 		vcpu->stat.pf_fixed++;
999 
1000 	return ret;
1001 }
1002 
1003 /*
1004  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1005  * page tables and SPTEs to translate the faulting guest physical address.
1006  */
1007 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
1008 		    int map_writable, int max_level, kvm_pfn_t pfn,
1009 		    bool prefault)
1010 {
1011 	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
1012 	bool write = error_code & PFERR_WRITE_MASK;
1013 	bool exec = error_code & PFERR_FETCH_MASK;
1014 	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
1015 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1016 	struct tdp_iter iter;
1017 	struct kvm_mmu_page *sp;
1018 	u64 *child_pt;
1019 	u64 new_spte;
1020 	int ret;
1021 	gfn_t gfn = gpa >> PAGE_SHIFT;
1022 	int level;
1023 	int req_level;
1024 
1025 	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
1026 					huge_page_disallowed, &req_level);
1027 
1028 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
1029 
1030 	rcu_read_lock();
1031 
1032 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1033 		if (nx_huge_page_workaround_enabled)
1034 			disallowed_hugepage_adjust(iter.old_spte, gfn,
1035 						   iter.level, &pfn, &level);
1036 
1037 		if (iter.level == level)
1038 			break;
1039 
1040 		/*
1041 		 * If there is an SPTE mapping a large page at a higher level
1042 		 * than the target, that SPTE must be cleared and replaced
1043 		 * with a non-leaf SPTE.
1044 		 */
1045 		if (is_shadow_present_pte(iter.old_spte) &&
1046 		    is_large_pte(iter.old_spte)) {
1047 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1048 				break;
1049 
1050 			/*
1051 			 * The iter must explicitly re-read the spte here
1052 			 * because the new value informs the !present
1053 			 * path below.
1054 			 */
1055 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1056 		}
1057 
1058 		if (!is_shadow_present_pte(iter.old_spte)) {
1059 			/*
1060 			 * If SPTE has been frozen by another thread, just
1061 			 * give up and retry, avoiding unnecessary page table
1062 			 * allocation and free.
1063 			 */
1064 			if (is_removed_spte(iter.old_spte))
1065 				break;
1066 
1067 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1068 			child_pt = sp->spt;
1069 
1070 			new_spte = make_nonleaf_spte(child_pt,
1071 						     !shadow_accessed_mask);
1072 
1073 			if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
1074 				tdp_mmu_link_page(vcpu->kvm, sp,
1075 						  huge_page_disallowed &&
1076 						  req_level >= iter.level);
1077 
1078 				trace_kvm_mmu_get_page(sp, true);
1079 			} else {
1080 				tdp_mmu_free_sp(sp);
1081 				break;
1082 			}
1083 		}
1084 	}
1085 
1086 	if (iter.level != level) {
1087 		rcu_read_unlock();
1088 		return RET_PF_RETRY;
1089 	}
1090 
1091 	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
1092 					      pfn, prefault);
1093 	rcu_read_unlock();
1094 
1095 	return ret;
1096 }
1097 
1098 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1099 				 bool flush)
1100 {
1101 	return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
1102 					   range->end, range->may_block, flush);
1103 }
1104 
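/*
 * Per-SPTE handler invoked by kvm_tdp_mmu_handle_gfn() below for each present
 * leaf SPTE in an MMU notifier range; the boolean results of all invocations
 * are OR'd together and returned to the caller.
 */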
1105 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1106 			      struct kvm_gfn_range *range);
1107 
1108 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1109 						   struct kvm_gfn_range *range,
1110 						   tdp_handler_t handler)
1111 {
1112 	struct kvm_mmu_page *root;
1113 	struct tdp_iter iter;
1114 	bool ret = false;
1115 
1116 	rcu_read_lock();
1117 
1118 	/*
1119 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1120 	 * into this helper allow blocking; it'd be dead, wasteful code.
1121 	 */
1122 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1123 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1124 			ret |= handler(kvm, &iter, range);
1125 	}
1126 
1127 	rcu_read_unlock();
1128 
1129 	return ret;
1130 }
1131 
1132 /*
1133  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1134  * if any of the GFNs in the range have been accessed.
1135  */
1136 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1137 			  struct kvm_gfn_range *range)
1138 {
1139 	u64 new_spte = 0;
1140 
1141 	/* If we have a non-accessed entry we don't need to change the pte. */
1142 	if (!is_accessed_spte(iter->old_spte))
1143 		return false;
1144 
1145 	new_spte = iter->old_spte;
1146 
1147 	if (spte_ad_enabled(new_spte)) {
1148 		new_spte &= ~shadow_accessed_mask;
1149 	} else {
1150 		/*
1151 		 * Capture the dirty status of the page, so that it doesn't get
1152 		 * lost when the SPTE is marked for access tracking.
1153 		 */
1154 		if (is_writable_pte(new_spte))
1155 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1156 
1157 		new_spte = mark_spte_for_access_track(new_spte);
1158 	}
1159 
1160 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1161 
1162 	return true;
1163 }
1164 
1165 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1166 {
1167 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1168 }
1169 
1170 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1171 			 struct kvm_gfn_range *range)
1172 {
1173 	return is_accessed_spte(iter->old_spte);
1174 }
1175 
1176 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1177 {
1178 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1179 }
1180 
1181 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1182 			 struct kvm_gfn_range *range)
1183 {
1184 	u64 new_spte;
1185 
1186 	/* Huge pages aren't expected to be modified without first being zapped. */
1187 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1188 
1189 	if (iter->level != PG_LEVEL_4K ||
1190 	    !is_shadow_present_pte(iter->old_spte))
1191 		return false;
1192 
1193 	/*
1194 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1195 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1196 	 * invariant that the PFN of a present leaf SPTE can never change.
1197 	 * See __handle_changed_spte().
1198 	 */
1199 	tdp_mmu_set_spte(kvm, iter, 0);
1200 
1201 	if (!pte_write(range->pte)) {
1202 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1203 								  pte_pfn(range->pte));
1204 
1205 		tdp_mmu_set_spte(kvm, iter, new_spte);
1206 	}
1207 
1208 	return true;
1209 }
1210 
1211 /*
1212  * Handle the changed_pte MMU notifier for the TDP MMU.
1213  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1214  * notifier.
1215  * Returns non-zero if a flush is needed before releasing the MMU lock.
1216  */
1217 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1218 {
1219 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1220 
1221 	/* FIXME: return 'flush' instead of flushing here. */
1222 	if (flush)
1223 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1224 
1225 	return false;
1226 }
1227 
1228 /*
1229  * Remove write access from all SPTEs at or above min_level that map GFNs
1230  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1231  * be flushed.
1232  */
1233 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1234 			     gfn_t start, gfn_t end, int min_level)
1235 {
1236 	struct tdp_iter iter;
1237 	u64 new_spte;
1238 	bool spte_set = false;
1239 
1240 	rcu_read_lock();
1241 
1242 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1243 
1244 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1245 				   min_level, start, end) {
1246 retry:
1247 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1248 			continue;
1249 
1250 		if (!is_shadow_present_pte(iter.old_spte) ||
1251 		    !is_last_spte(iter.old_spte, iter.level) ||
1252 		    !(iter.old_spte & PT_WRITABLE_MASK))
1253 			continue;
1254 
1255 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1256 
1257 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1258 							  new_spte)) {
1259 			/*
1260 			 * The iter must explicitly re-read the SPTE because
1261 			 * the atomic cmpxchg failed.
1262 			 */
1263 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1264 			goto retry;
1265 		}
1266 		spte_set = true;
1267 	}
1268 
1269 	rcu_read_unlock();
1270 	return spte_set;
1271 }
1272 
1273 /*
1274  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1275  * only affect leaf SPTEs down to min_level.
1276  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1277  */
1278 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1279 			     const struct kvm_memory_slot *slot, int min_level)
1280 {
1281 	struct kvm_mmu_page *root;
1282 	bool spte_set = false;
1283 
1284 	lockdep_assert_held_read(&kvm->mmu_lock);
1285 
1286 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1287 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1288 			     slot->base_gfn + slot->npages, min_level);
1289 
1290 	return spte_set;
1291 }
1292 
1293 /*
1294  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1295  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1296  * If AD bits are not enabled, this will require clearing the writable bit on
1297  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1298  * be flushed.
1299  */
1300 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1301 			   gfn_t start, gfn_t end)
1302 {
1303 	struct tdp_iter iter;
1304 	u64 new_spte;
1305 	bool spte_set = false;
1306 
1307 	rcu_read_lock();
1308 
1309 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1310 retry:
1311 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1312 			continue;
1313 
1314 		if (!is_shadow_present_pte(iter.old_spte))
1315 			continue;
1316 
1317 		if (spte_ad_need_write_protect(iter.old_spte)) {
1318 			if (is_writable_pte(iter.old_spte))
1319 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1320 			else
1321 				continue;
1322 		} else {
1323 			if (iter.old_spte & shadow_dirty_mask)
1324 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1325 			else
1326 				continue;
1327 		}
1328 
1329 		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1330 							  new_spte)) {
1331 			/*
1332 			 * The iter must explicitly re-read the SPTE because
1333 			 * the atomic cmpxchg failed.
1334 			 */
1335 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1336 			goto retry;
1337 		}
1338 		spte_set = true;
1339 	}
1340 
1341 	rcu_read_unlock();
1342 	return spte_set;
1343 }
1344 
1345 /*
1346  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1347  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1348  * If AD bits are not enabled, this will require clearing the writable bit on
1349  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1350  * be flushed.
1351  */
1352 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1353 				  const struct kvm_memory_slot *slot)
1354 {
1355 	struct kvm_mmu_page *root;
1356 	bool spte_set = false;
1357 
1358 	lockdep_assert_held_read(&kvm->mmu_lock);
1359 
1360 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1361 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1362 				slot->base_gfn + slot->npages);
1363 
1364 	return spte_set;
1365 }
1366 
1367 /*
1368  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1369  * set in mask, starting at gfn. The given memslot is expected to contain all
1370  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1371  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1372  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1373  */
1374 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1375 				  gfn_t gfn, unsigned long mask, bool wrprot)
1376 {
1377 	struct tdp_iter iter;
1378 	u64 new_spte;
1379 
1380 	rcu_read_lock();
1381 
1382 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1383 				    gfn + BITS_PER_LONG) {
1384 		if (!mask)
1385 			break;
1386 
1387 		if (iter.level > PG_LEVEL_4K ||
1388 		    !(mask & (1UL << (iter.gfn - gfn))))
1389 			continue;
1390 
1391 		mask &= ~(1UL << (iter.gfn - gfn));
1392 
1393 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1394 			if (is_writable_pte(iter.old_spte))
1395 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1396 			else
1397 				continue;
1398 		} else {
1399 			if (iter.old_spte & shadow_dirty_mask)
1400 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1401 			else
1402 				continue;
1403 		}
1404 
1405 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1406 	}
1407 
1408 	rcu_read_unlock();
1409 }
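
/*
 * Worked example of the mask handling above (illustrative values): with
 * gfn = G and mask = 0b0101, the walk starts at G + __ffs(mask) = G and
 * clears the dirty (or writable) bit only for the 4K SPTEs mapping G and
 * G + 2, clearing each bit from the mask as it is consumed and stopping as
 * soon as the mask is empty.
 */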
1410 
1411 /*
1412  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1413  * set in mask, starting at gfn. The given memslot is expected to contain all
1414  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1415  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1416  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1417  */
1418 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1419 				       struct kvm_memory_slot *slot,
1420 				       gfn_t gfn, unsigned long mask,
1421 				       bool wrprot)
1422 {
1423 	struct kvm_mmu_page *root;
1424 
1425 	lockdep_assert_held_write(&kvm->mmu_lock);
1426 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1427 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1428 }
1429 
1430 /*
1431  * Clear leaf entries which could be replaced by large mappings, for
1432  * GFNs within the slot.
1433  */
1434 static void zap_collapsible_spte_range(struct kvm *kvm,
1435 				       struct kvm_mmu_page *root,
1436 				       const struct kvm_memory_slot *slot)
1437 {
1438 	gfn_t start = slot->base_gfn;
1439 	gfn_t end = start + slot->npages;
1440 	struct tdp_iter iter;
1441 	kvm_pfn_t pfn;
1442 
1443 	rcu_read_lock();
1444 
1445 	tdp_root_for_each_pte(iter, root, start, end) {
1446 retry:
1447 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1448 			continue;
1449 
1450 		if (!is_shadow_present_pte(iter.old_spte) ||
1451 		    !is_last_spte(iter.old_spte, iter.level))
1452 			continue;
1453 
1454 		pfn = spte_to_pfn(iter.old_spte);
1455 		if (kvm_is_reserved_pfn(pfn) ||
1456 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1457 							    pfn, PG_LEVEL_NUM))
1458 			continue;
1459 
1460 		/* Note, a successful atomic zap also does a remote TLB flush. */
1461 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1462 			/*
1463 			 * The iter must explicitly re-read the SPTE because
1464 			 * the atomic cmpxchg failed.
1465 			 */
1466 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1467 			goto retry;
1468 		}
1469 	}
1470 
1471 	rcu_read_unlock();
1472 }
1473 
1474 /*
1475  * Clear non-leaf entries (and free associated page tables) which could
1476  * be replaced by large mappings, for GFNs within the slot.
1477  */
1478 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1479 				       const struct kvm_memory_slot *slot)
1480 {
1481 	struct kvm_mmu_page *root;
1482 
1483 	lockdep_assert_held_read(&kvm->mmu_lock);
1484 
1485 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1486 		zap_collapsible_spte_range(kvm, root, slot);
1487 }
1488 
1489 /*
1490  * Removes write access on the last level SPTE mapping this GFN and unsets the
1491  * MMU-writable bit to ensure future writes continue to be intercepted.
1492  * Returns true if an SPTE was set and a TLB flush is needed.
1493  */
1494 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1495 			      gfn_t gfn, int min_level)
1496 {
1497 	struct tdp_iter iter;
1498 	u64 new_spte;
1499 	bool spte_set = false;
1500 
1501 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1502 
1503 	rcu_read_lock();
1504 
1505 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1506 				   min_level, gfn, gfn + 1) {
1507 		if (!is_shadow_present_pte(iter.old_spte) ||
1508 		    !is_last_spte(iter.old_spte, iter.level))
1509 			continue;
1510 
1511 		new_spte = iter.old_spte &
1512 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1513 
1514 		if (new_spte == iter.old_spte)
1515 			break;
1516 
1517 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1518 		spte_set = true;
1519 	}
1520 
1521 	rcu_read_unlock();
1522 
1523 	return spte_set;
1524 }
1525 
1526 /*
1527  * Removes write access on the last level SPTE mapping this GFN and unsets the
1528  * MMU-writable bit to ensure future writes continue to be intercepted.
1529  * Returns true if an SPTE was set and a TLB flush is needed.
1530  */
1531 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1532 				   struct kvm_memory_slot *slot, gfn_t gfn,
1533 				   int min_level)
1534 {
1535 	struct kvm_mmu_page *root;
1536 	bool spte_set = false;
1537 
1538 	lockdep_assert_held_write(&kvm->mmu_lock);
1539 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1540 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1541 
1542 	return spte_set;
1543 }
1544 
1545 /*
1546  * Return the level of the lowest level SPTE added to sptes.
1547  * That SPTE may be non-present.
1548  *
1549  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1550  */
1551 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1552 			 int *root_level)
1553 {
1554 	struct tdp_iter iter;
1555 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1556 	gfn_t gfn = addr >> PAGE_SHIFT;
1557 	int leaf = -1;
1558 
1559 	*root_level = vcpu->arch.mmu->shadow_root_level;
1560 
1561 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1562 		leaf = iter.level;
1563 		sptes[leaf] = iter.old_spte;
1564 	}
1565 
1566 	return leaf;
1567 }
1568 
1569 /*
1570  * Returns the last level spte pointer of the shadow page walk for the given
1571  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1572  * walk could be performed, returns NULL and *spte does not contain valid data.
1573  *
1574  * Contract:
1575  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1576  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1577  *
1578  * WARNING: This function is only intended to be called during fast_page_fault.
1579  */
1580 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1581 					u64 *spte)
1582 {
1583 	struct tdp_iter iter;
1584 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1585 	gfn_t gfn = addr >> PAGE_SHIFT;
1586 	tdp_ptep_t sptep = NULL;
1587 
1588 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1589 		*spte = iter.old_spte;
1590 		sptep = iter.sptep;
1591 	}
1592 
1593 	/*
1594 	 * Perform the rcu_dereference to get the raw spte pointer value since
1595 	 * we are passing it up to fast_page_fault, which is shared with the
1596 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1597 	 * annotation.
1598 	 *
1599 	 * This is safe since fast_page_fault obeys the contracts of this
1600 	 * function as well as all TDP MMU contracts around modifying SPTEs
1601 	 * outside of mmu_lock.
1602 	 */
1603 	return rcu_dereference(sptep);
1604 }
1605