• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4   * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5   */
6  
7  #include <linux/mman.h>
8  #include <linux/kvm_host.h>
9  #include <linux/io.h>
10  #include <linux/hugetlb.h>
11  #include <linux/sched/signal.h>
12  #include <trace/events/kvm.h>
13  #include <asm/pgalloc.h>
14  #include <asm/cacheflush.h>
15  #include <asm/kvm_arm.h>
16  #include <asm/kvm_mmu.h>
17  #include <asm/kvm_mmio.h>
18  #include <asm/kvm_ras.h>
19  #include <asm/kvm_asm.h>
20  #include <asm/kvm_emulate.h>
21  #include <asm/virt.h>
22  
23  #include "trace.h"
24  
25  static pgd_t *boot_hyp_pgd;
26  static pgd_t *hyp_pgd;
27  static pgd_t *merged_hyp_pgd;
28  static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
29  
30  static unsigned long hyp_idmap_start;
31  static unsigned long hyp_idmap_end;
32  static phys_addr_t hyp_idmap_vector;
33  
34  static unsigned long io_map_base;
35  
36  #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
37  
38  #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
39  #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
40  
is_iomap(unsigned long flags)41  static bool is_iomap(unsigned long flags)
42  {
43  	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
44  }
45  
memslot_is_logging(struct kvm_memory_slot * memslot)46  static bool memslot_is_logging(struct kvm_memory_slot *memslot)
47  {
48  	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
49  }
50  
51  /**
52   * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
53   * @kvm:	pointer to kvm structure.
54   *
55   * Interface to HYP function to flush all VM TLB entries
56   */
kvm_flush_remote_tlbs(struct kvm * kvm)57  void kvm_flush_remote_tlbs(struct kvm *kvm)
58  {
59  	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
60  }
61  
kvm_tlb_flush_vmid_ipa(struct kvm * kvm,phys_addr_t ipa)62  static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
63  {
64  	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
65  }
66  
67  /*
68   * D-Cache management functions. They take the page table entries by
69   * value, as they are flushing the cache using the kernel mapping (or
70   * kmap on 32bit).
71   */
kvm_flush_dcache_pte(pte_t pte)72  static void kvm_flush_dcache_pte(pte_t pte)
73  {
74  	__kvm_flush_dcache_pte(pte);
75  }
76  
kvm_flush_dcache_pmd(pmd_t pmd)77  static void kvm_flush_dcache_pmd(pmd_t pmd)
78  {
79  	__kvm_flush_dcache_pmd(pmd);
80  }
81  
kvm_flush_dcache_pud(pud_t pud)82  static void kvm_flush_dcache_pud(pud_t pud)
83  {
84  	__kvm_flush_dcache_pud(pud);
85  }
86  
kvm_is_device_pfn(unsigned long pfn)87  static bool kvm_is_device_pfn(unsigned long pfn)
88  {
89  	return !pfn_valid(pfn);
90  }
91  
92  /**
93   * stage2_dissolve_pmd() - clear and flush huge PMD entry
94   * @kvm:	pointer to kvm structure.
95   * @addr:	IPA
96   * @pmd:	pmd pointer for IPA
97   *
98   * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
99   */
stage2_dissolve_pmd(struct kvm * kvm,phys_addr_t addr,pmd_t * pmd)100  static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
101  {
102  	if (!pmd_thp_or_huge(*pmd))
103  		return;
104  
105  	pmd_clear(pmd);
106  	kvm_tlb_flush_vmid_ipa(kvm, addr);
107  	put_page(virt_to_page(pmd));
108  }
109  
110  /**
111   * stage2_dissolve_pud() - clear and flush huge PUD entry
112   * @kvm:	pointer to kvm structure.
113   * @addr:	IPA
114   * @pud:	pud pointer for IPA
115   *
116   * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
117   */
stage2_dissolve_pud(struct kvm * kvm,phys_addr_t addr,pud_t * pudp)118  static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
119  {
120  	if (!stage2_pud_huge(kvm, *pudp))
121  		return;
122  
123  	stage2_pud_clear(kvm, pudp);
124  	kvm_tlb_flush_vmid_ipa(kvm, addr);
125  	put_page(virt_to_page(pudp));
126  }
127  
mmu_topup_memory_cache(struct kvm_mmu_memory_cache * cache,int min,int max)128  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
129  				  int min, int max)
130  {
131  	void *page;
132  
133  	BUG_ON(max > KVM_NR_MEM_OBJS);
134  	if (cache->nobjs >= min)
135  		return 0;
136  	while (cache->nobjs < max) {
137  		page = (void *)__get_free_page(GFP_PGTABLE_USER);
138  		if (!page)
139  			return -ENOMEM;
140  		cache->objects[cache->nobjs++] = page;
141  	}
142  	return 0;
143  }
144  
mmu_free_memory_cache(struct kvm_mmu_memory_cache * mc)145  static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
146  {
147  	while (mc->nobjs)
148  		free_page((unsigned long)mc->objects[--mc->nobjs]);
149  }
150  
mmu_memory_cache_alloc(struct kvm_mmu_memory_cache * mc)151  static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
152  {
153  	void *p;
154  
155  	BUG_ON(!mc || !mc->nobjs);
156  	p = mc->objects[--mc->nobjs];
157  	return p;
158  }
159  
clear_stage2_pgd_entry(struct kvm * kvm,pgd_t * pgd,phys_addr_t addr)160  static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
161  {
162  	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
163  	stage2_pgd_clear(kvm, pgd);
164  	kvm_tlb_flush_vmid_ipa(kvm, addr);
165  	stage2_pud_free(kvm, pud_table);
166  	put_page(virt_to_page(pgd));
167  }
168  
clear_stage2_pud_entry(struct kvm * kvm,pud_t * pud,phys_addr_t addr)169  static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
170  {
171  	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
172  	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
173  	stage2_pud_clear(kvm, pud);
174  	kvm_tlb_flush_vmid_ipa(kvm, addr);
175  	stage2_pmd_free(kvm, pmd_table);
176  	put_page(virt_to_page(pud));
177  }
178  
clear_stage2_pmd_entry(struct kvm * kvm,pmd_t * pmd,phys_addr_t addr)179  static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
180  {
181  	pte_t *pte_table = pte_offset_kernel(pmd, 0);
182  	VM_BUG_ON(pmd_thp_or_huge(*pmd));
183  	pmd_clear(pmd);
184  	kvm_tlb_flush_vmid_ipa(kvm, addr);
185  	free_page((unsigned long)pte_table);
186  	put_page(virt_to_page(pmd));
187  }
188  
kvm_set_pte(pte_t * ptep,pte_t new_pte)189  static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
190  {
191  	WRITE_ONCE(*ptep, new_pte);
192  	dsb(ishst);
193  }
194  
kvm_set_pmd(pmd_t * pmdp,pmd_t new_pmd)195  static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
196  {
197  	WRITE_ONCE(*pmdp, new_pmd);
198  	dsb(ishst);
199  }
200  
kvm_pmd_populate(pmd_t * pmdp,pte_t * ptep)201  static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
202  {
203  	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
204  }
205  
kvm_pud_populate(pud_t * pudp,pmd_t * pmdp)206  static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
207  {
208  	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
209  	dsb(ishst);
210  }
211  
kvm_pgd_populate(pgd_t * pgdp,pud_t * pudp)212  static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
213  {
214  	WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
215  	dsb(ishst);
216  }
217  
218  /*
219   * Unmapping vs dcache management:
220   *
221   * If a guest maps certain memory pages as uncached, all writes will
222   * bypass the data cache and go directly to RAM.  However, the CPUs
223   * can still speculate reads (not writes) and fill cache lines with
224   * data.
225   *
226   * Those cache lines will be *clean* cache lines though, so a
227   * clean+invalidate operation is equivalent to an invalidate
228   * operation, because no cache lines are marked dirty.
229   *
230   * Those clean cache lines could be filled prior to an uncached write
231   * by the guest, and the cache coherent IO subsystem would therefore
232   * end up writing old data to disk.
233   *
234   * This is why right after unmapping a page/section and invalidating
235   * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
236   * the IO subsystem will never hit in the cache.
237   *
238   * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
239   * we then fully enforce cacheability of RAM, no matter what the guest
240   * does.
241   */
unmap_stage2_ptes(struct kvm * kvm,pmd_t * pmd,phys_addr_t addr,phys_addr_t end)242  static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
243  		       phys_addr_t addr, phys_addr_t end)
244  {
245  	phys_addr_t start_addr = addr;
246  	pte_t *pte, *start_pte;
247  
248  	start_pte = pte = pte_offset_kernel(pmd, addr);
249  	do {
250  		if (!pte_none(*pte)) {
251  			pte_t old_pte = *pte;
252  
253  			kvm_set_pte(pte, __pte(0));
254  			kvm_tlb_flush_vmid_ipa(kvm, addr);
255  
256  			/* No need to invalidate the cache for device mappings */
257  			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
258  				kvm_flush_dcache_pte(old_pte);
259  
260  			put_page(virt_to_page(pte));
261  		}
262  	} while (pte++, addr += PAGE_SIZE, addr != end);
263  
264  	if (stage2_pte_table_empty(kvm, start_pte))
265  		clear_stage2_pmd_entry(kvm, pmd, start_addr);
266  }
267  
unmap_stage2_pmds(struct kvm * kvm,pud_t * pud,phys_addr_t addr,phys_addr_t end)268  static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
269  		       phys_addr_t addr, phys_addr_t end)
270  {
271  	phys_addr_t next, start_addr = addr;
272  	pmd_t *pmd, *start_pmd;
273  
274  	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
275  	do {
276  		next = stage2_pmd_addr_end(kvm, addr, end);
277  		if (!pmd_none(*pmd)) {
278  			if (pmd_thp_or_huge(*pmd)) {
279  				pmd_t old_pmd = *pmd;
280  
281  				pmd_clear(pmd);
282  				kvm_tlb_flush_vmid_ipa(kvm, addr);
283  
284  				kvm_flush_dcache_pmd(old_pmd);
285  
286  				put_page(virt_to_page(pmd));
287  			} else {
288  				unmap_stage2_ptes(kvm, pmd, addr, next);
289  			}
290  		}
291  	} while (pmd++, addr = next, addr != end);
292  
293  	if (stage2_pmd_table_empty(kvm, start_pmd))
294  		clear_stage2_pud_entry(kvm, pud, start_addr);
295  }
296  
unmap_stage2_puds(struct kvm * kvm,pgd_t * pgd,phys_addr_t addr,phys_addr_t end)297  static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
298  		       phys_addr_t addr, phys_addr_t end)
299  {
300  	phys_addr_t next, start_addr = addr;
301  	pud_t *pud, *start_pud;
302  
303  	start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
304  	do {
305  		next = stage2_pud_addr_end(kvm, addr, end);
306  		if (!stage2_pud_none(kvm, *pud)) {
307  			if (stage2_pud_huge(kvm, *pud)) {
308  				pud_t old_pud = *pud;
309  
310  				stage2_pud_clear(kvm, pud);
311  				kvm_tlb_flush_vmid_ipa(kvm, addr);
312  				kvm_flush_dcache_pud(old_pud);
313  				put_page(virt_to_page(pud));
314  			} else {
315  				unmap_stage2_pmds(kvm, pud, addr, next);
316  			}
317  		}
318  	} while (pud++, addr = next, addr != end);
319  
320  	if (stage2_pud_table_empty(kvm, start_pud))
321  		clear_stage2_pgd_entry(kvm, pgd, start_addr);
322  }
323  
324  /**
325   * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
326   * @kvm:   The VM pointer
327   * @start: The intermediate physical base address of the range to unmap
328   * @size:  The size of the area to unmap
329   *
330   * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
331   * be called while holding mmu_lock (unless for freeing the stage2 pgd before
332   * destroying the VM), otherwise another faulting VCPU may come in and mess
333   * with things behind our backs.
334   */
unmap_stage2_range(struct kvm * kvm,phys_addr_t start,u64 size)335  static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
336  {
337  	pgd_t *pgd;
338  	phys_addr_t addr = start, end = start + size;
339  	phys_addr_t next;
340  
341  	assert_spin_locked(&kvm->mmu_lock);
342  	WARN_ON(size & ~PAGE_MASK);
343  
344  	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
345  	do {
346  		/*
347  		 * Make sure the page table is still active, as another thread
348  		 * could have possibly freed the page table, while we released
349  		 * the lock.
350  		 */
351  		if (!READ_ONCE(kvm->arch.pgd))
352  			break;
353  		next = stage2_pgd_addr_end(kvm, addr, end);
354  		if (!stage2_pgd_none(kvm, *pgd))
355  			unmap_stage2_puds(kvm, pgd, addr, next);
356  		/*
357  		 * If the range is too large, release the kvm->mmu_lock
358  		 * to prevent starvation and lockup detector warnings.
359  		 */
360  		if (next != end)
361  			cond_resched_lock(&kvm->mmu_lock);
362  	} while (pgd++, addr = next, addr != end);
363  }
364  
stage2_flush_ptes(struct kvm * kvm,pmd_t * pmd,phys_addr_t addr,phys_addr_t end)365  static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
366  			      phys_addr_t addr, phys_addr_t end)
367  {
368  	pte_t *pte;
369  
370  	pte = pte_offset_kernel(pmd, addr);
371  	do {
372  		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
373  			kvm_flush_dcache_pte(*pte);
374  	} while (pte++, addr += PAGE_SIZE, addr != end);
375  }
376  
stage2_flush_pmds(struct kvm * kvm,pud_t * pud,phys_addr_t addr,phys_addr_t end)377  static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
378  			      phys_addr_t addr, phys_addr_t end)
379  {
380  	pmd_t *pmd;
381  	phys_addr_t next;
382  
383  	pmd = stage2_pmd_offset(kvm, pud, addr);
384  	do {
385  		next = stage2_pmd_addr_end(kvm, addr, end);
386  		if (!pmd_none(*pmd)) {
387  			if (pmd_thp_or_huge(*pmd))
388  				kvm_flush_dcache_pmd(*pmd);
389  			else
390  				stage2_flush_ptes(kvm, pmd, addr, next);
391  		}
392  	} while (pmd++, addr = next, addr != end);
393  }
394  
stage2_flush_puds(struct kvm * kvm,pgd_t * pgd,phys_addr_t addr,phys_addr_t end)395  static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
396  			      phys_addr_t addr, phys_addr_t end)
397  {
398  	pud_t *pud;
399  	phys_addr_t next;
400  
401  	pud = stage2_pud_offset(kvm, pgd, addr);
402  	do {
403  		next = stage2_pud_addr_end(kvm, addr, end);
404  		if (!stage2_pud_none(kvm, *pud)) {
405  			if (stage2_pud_huge(kvm, *pud))
406  				kvm_flush_dcache_pud(*pud);
407  			else
408  				stage2_flush_pmds(kvm, pud, addr, next);
409  		}
410  	} while (pud++, addr = next, addr != end);
411  }
412  
stage2_flush_memslot(struct kvm * kvm,struct kvm_memory_slot * memslot)413  static void stage2_flush_memslot(struct kvm *kvm,
414  				 struct kvm_memory_slot *memslot)
415  {
416  	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
417  	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
418  	phys_addr_t next;
419  	pgd_t *pgd;
420  
421  	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
422  	do {
423  		next = stage2_pgd_addr_end(kvm, addr, end);
424  		if (!stage2_pgd_none(kvm, *pgd))
425  			stage2_flush_puds(kvm, pgd, addr, next);
426  	} while (pgd++, addr = next, addr != end);
427  }
428  
429  /**
430   * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
431   * @kvm: The struct kvm pointer
432   *
433   * Go through the stage 2 page tables and invalidate any cache lines
434   * backing memory already mapped to the VM.
435   */
stage2_flush_vm(struct kvm * kvm)436  static void stage2_flush_vm(struct kvm *kvm)
437  {
438  	struct kvm_memslots *slots;
439  	struct kvm_memory_slot *memslot;
440  	int idx;
441  
442  	idx = srcu_read_lock(&kvm->srcu);
443  	spin_lock(&kvm->mmu_lock);
444  
445  	slots = kvm_memslots(kvm);
446  	kvm_for_each_memslot(memslot, slots)
447  		stage2_flush_memslot(kvm, memslot);
448  
449  	spin_unlock(&kvm->mmu_lock);
450  	srcu_read_unlock(&kvm->srcu, idx);
451  }
452  
clear_hyp_pgd_entry(pgd_t * pgd)453  static void clear_hyp_pgd_entry(pgd_t *pgd)
454  {
455  	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
456  	pgd_clear(pgd);
457  	pud_free(NULL, pud_table);
458  	put_page(virt_to_page(pgd));
459  }
460  
clear_hyp_pud_entry(pud_t * pud)461  static void clear_hyp_pud_entry(pud_t *pud)
462  {
463  	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
464  	VM_BUG_ON(pud_huge(*pud));
465  	pud_clear(pud);
466  	pmd_free(NULL, pmd_table);
467  	put_page(virt_to_page(pud));
468  }
469  
clear_hyp_pmd_entry(pmd_t * pmd)470  static void clear_hyp_pmd_entry(pmd_t *pmd)
471  {
472  	pte_t *pte_table = pte_offset_kernel(pmd, 0);
473  	VM_BUG_ON(pmd_thp_or_huge(*pmd));
474  	pmd_clear(pmd);
475  	pte_free_kernel(NULL, pte_table);
476  	put_page(virt_to_page(pmd));
477  }
478  
unmap_hyp_ptes(pmd_t * pmd,phys_addr_t addr,phys_addr_t end)479  static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
480  {
481  	pte_t *pte, *start_pte;
482  
483  	start_pte = pte = pte_offset_kernel(pmd, addr);
484  	do {
485  		if (!pte_none(*pte)) {
486  			kvm_set_pte(pte, __pte(0));
487  			put_page(virt_to_page(pte));
488  		}
489  	} while (pte++, addr += PAGE_SIZE, addr != end);
490  
491  	if (hyp_pte_table_empty(start_pte))
492  		clear_hyp_pmd_entry(pmd);
493  }
494  
unmap_hyp_pmds(pud_t * pud,phys_addr_t addr,phys_addr_t end)495  static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
496  {
497  	phys_addr_t next;
498  	pmd_t *pmd, *start_pmd;
499  
500  	start_pmd = pmd = pmd_offset(pud, addr);
501  	do {
502  		next = pmd_addr_end(addr, end);
503  		/* Hyp doesn't use huge pmds */
504  		if (!pmd_none(*pmd))
505  			unmap_hyp_ptes(pmd, addr, next);
506  	} while (pmd++, addr = next, addr != end);
507  
508  	if (hyp_pmd_table_empty(start_pmd))
509  		clear_hyp_pud_entry(pud);
510  }
511  
unmap_hyp_puds(pgd_t * pgd,phys_addr_t addr,phys_addr_t end)512  static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
513  {
514  	phys_addr_t next;
515  	pud_t *pud, *start_pud;
516  
517  	start_pud = pud = pud_offset(pgd, addr);
518  	do {
519  		next = pud_addr_end(addr, end);
520  		/* Hyp doesn't use huge puds */
521  		if (!pud_none(*pud))
522  			unmap_hyp_pmds(pud, addr, next);
523  	} while (pud++, addr = next, addr != end);
524  
525  	if (hyp_pud_table_empty(start_pud))
526  		clear_hyp_pgd_entry(pgd);
527  }
528  
kvm_pgd_index(unsigned long addr,unsigned int ptrs_per_pgd)529  static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
530  {
531  	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
532  }
533  
__unmap_hyp_range(pgd_t * pgdp,unsigned long ptrs_per_pgd,phys_addr_t start,u64 size)534  static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
535  			      phys_addr_t start, u64 size)
536  {
537  	pgd_t *pgd;
538  	phys_addr_t addr = start, end = start + size;
539  	phys_addr_t next;
540  
541  	/*
542  	 * We don't unmap anything from HYP, except at the hyp tear down.
543  	 * Hence, we don't have to invalidate the TLBs here.
544  	 */
545  	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
546  	do {
547  		next = pgd_addr_end(addr, end);
548  		if (!pgd_none(*pgd))
549  			unmap_hyp_puds(pgd, addr, next);
550  	} while (pgd++, addr = next, addr != end);
551  }
552  
unmap_hyp_range(pgd_t * pgdp,phys_addr_t start,u64 size)553  static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
554  {
555  	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
556  }
557  
unmap_hyp_idmap_range(pgd_t * pgdp,phys_addr_t start,u64 size)558  static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
559  {
560  	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
561  }
562  
563  /**
564   * free_hyp_pgds - free Hyp-mode page tables
565   *
566   * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
567   * therefore contains either mappings in the kernel memory area (above
568   * PAGE_OFFSET), or device mappings in the idmap range.
569   *
570   * boot_hyp_pgd should only map the idmap range, and is only used in
571   * the extended idmap case.
572   */
free_hyp_pgds(void)573  void free_hyp_pgds(void)
574  {
575  	pgd_t *id_pgd;
576  
577  	mutex_lock(&kvm_hyp_pgd_mutex);
578  
579  	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
580  
581  	if (id_pgd) {
582  		/* In case we never called hyp_mmu_init() */
583  		if (!io_map_base)
584  			io_map_base = hyp_idmap_start;
585  		unmap_hyp_idmap_range(id_pgd, io_map_base,
586  				      hyp_idmap_start + PAGE_SIZE - io_map_base);
587  	}
588  
589  	if (boot_hyp_pgd) {
590  		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
591  		boot_hyp_pgd = NULL;
592  	}
593  
594  	if (hyp_pgd) {
595  		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
596  				(uintptr_t)high_memory - PAGE_OFFSET);
597  
598  		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
599  		hyp_pgd = NULL;
600  	}
601  	if (merged_hyp_pgd) {
602  		clear_page(merged_hyp_pgd);
603  		free_page((unsigned long)merged_hyp_pgd);
604  		merged_hyp_pgd = NULL;
605  	}
606  
607  	mutex_unlock(&kvm_hyp_pgd_mutex);
608  }
609  
create_hyp_pte_mappings(pmd_t * pmd,unsigned long start,unsigned long end,unsigned long pfn,pgprot_t prot)610  static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
611  				    unsigned long end, unsigned long pfn,
612  				    pgprot_t prot)
613  {
614  	pte_t *pte;
615  	unsigned long addr;
616  
617  	addr = start;
618  	do {
619  		pte = pte_offset_kernel(pmd, addr);
620  		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
621  		get_page(virt_to_page(pte));
622  		pfn++;
623  	} while (addr += PAGE_SIZE, addr != end);
624  }
625  
create_hyp_pmd_mappings(pud_t * pud,unsigned long start,unsigned long end,unsigned long pfn,pgprot_t prot)626  static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
627  				   unsigned long end, unsigned long pfn,
628  				   pgprot_t prot)
629  {
630  	pmd_t *pmd;
631  	pte_t *pte;
632  	unsigned long addr, next;
633  
634  	addr = start;
635  	do {
636  		pmd = pmd_offset(pud, addr);
637  
638  		BUG_ON(pmd_sect(*pmd));
639  
640  		if (pmd_none(*pmd)) {
641  			pte = pte_alloc_one_kernel(NULL);
642  			if (!pte) {
643  				kvm_err("Cannot allocate Hyp pte\n");
644  				return -ENOMEM;
645  			}
646  			kvm_pmd_populate(pmd, pte);
647  			get_page(virt_to_page(pmd));
648  		}
649  
650  		next = pmd_addr_end(addr, end);
651  
652  		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
653  		pfn += (next - addr) >> PAGE_SHIFT;
654  	} while (addr = next, addr != end);
655  
656  	return 0;
657  }
658  
create_hyp_pud_mappings(pgd_t * pgd,unsigned long start,unsigned long end,unsigned long pfn,pgprot_t prot)659  static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
660  				   unsigned long end, unsigned long pfn,
661  				   pgprot_t prot)
662  {
663  	pud_t *pud;
664  	pmd_t *pmd;
665  	unsigned long addr, next;
666  	int ret;
667  
668  	addr = start;
669  	do {
670  		pud = pud_offset(pgd, addr);
671  
672  		if (pud_none_or_clear_bad(pud)) {
673  			pmd = pmd_alloc_one(NULL, addr);
674  			if (!pmd) {
675  				kvm_err("Cannot allocate Hyp pmd\n");
676  				return -ENOMEM;
677  			}
678  			kvm_pud_populate(pud, pmd);
679  			get_page(virt_to_page(pud));
680  		}
681  
682  		next = pud_addr_end(addr, end);
683  		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
684  		if (ret)
685  			return ret;
686  		pfn += (next - addr) >> PAGE_SHIFT;
687  	} while (addr = next, addr != end);
688  
689  	return 0;
690  }
691  
__create_hyp_mappings(pgd_t * pgdp,unsigned long ptrs_per_pgd,unsigned long start,unsigned long end,unsigned long pfn,pgprot_t prot)692  static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
693  				 unsigned long start, unsigned long end,
694  				 unsigned long pfn, pgprot_t prot)
695  {
696  	pgd_t *pgd;
697  	pud_t *pud;
698  	unsigned long addr, next;
699  	int err = 0;
700  
701  	mutex_lock(&kvm_hyp_pgd_mutex);
702  	addr = start & PAGE_MASK;
703  	end = PAGE_ALIGN(end);
704  	do {
705  		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
706  
707  		if (pgd_none(*pgd)) {
708  			pud = pud_alloc_one(NULL, addr);
709  			if (!pud) {
710  				kvm_err("Cannot allocate Hyp pud\n");
711  				err = -ENOMEM;
712  				goto out;
713  			}
714  			kvm_pgd_populate(pgd, pud);
715  			get_page(virt_to_page(pgd));
716  		}
717  
718  		next = pgd_addr_end(addr, end);
719  		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
720  		if (err)
721  			goto out;
722  		pfn += (next - addr) >> PAGE_SHIFT;
723  	} while (addr = next, addr != end);
724  out:
725  	mutex_unlock(&kvm_hyp_pgd_mutex);
726  	return err;
727  }
728  
kvm_kaddr_to_phys(void * kaddr)729  static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
730  {
731  	if (!is_vmalloc_addr(kaddr)) {
732  		BUG_ON(!virt_addr_valid(kaddr));
733  		return __pa(kaddr);
734  	} else {
735  		return page_to_phys(vmalloc_to_page(kaddr)) +
736  		       offset_in_page(kaddr);
737  	}
738  }
739  
740  /**
741   * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
742   * @from:	The virtual kernel start address of the range
743   * @to:		The virtual kernel end address of the range (exclusive)
744   * @prot:	The protection to be applied to this range
745   *
746   * The same virtual address as the kernel virtual address is also used
747   * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
748   * physical pages.
749   */
create_hyp_mappings(void * from,void * to,pgprot_t prot)750  int create_hyp_mappings(void *from, void *to, pgprot_t prot)
751  {
752  	phys_addr_t phys_addr;
753  	unsigned long virt_addr;
754  	unsigned long start = kern_hyp_va((unsigned long)from);
755  	unsigned long end = kern_hyp_va((unsigned long)to);
756  
757  	if (is_kernel_in_hyp_mode())
758  		return 0;
759  
760  	start = start & PAGE_MASK;
761  	end = PAGE_ALIGN(end);
762  
763  	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
764  		int err;
765  
766  		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
767  		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
768  					    virt_addr, virt_addr + PAGE_SIZE,
769  					    __phys_to_pfn(phys_addr),
770  					    prot);
771  		if (err)
772  			return err;
773  	}
774  
775  	return 0;
776  }
777  
__create_hyp_private_mapping(phys_addr_t phys_addr,size_t size,unsigned long * haddr,pgprot_t prot)778  static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
779  					unsigned long *haddr, pgprot_t prot)
780  {
781  	pgd_t *pgd = hyp_pgd;
782  	unsigned long base;
783  	int ret = 0;
784  
785  	mutex_lock(&kvm_hyp_pgd_mutex);
786  
787  	/*
788  	 * This assumes that we we have enough space below the idmap
789  	 * page to allocate our VAs. If not, the check below will
790  	 * kick. A potential alternative would be to detect that
791  	 * overflow and switch to an allocation above the idmap.
792  	 *
793  	 * The allocated size is always a multiple of PAGE_SIZE.
794  	 */
795  	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
796  	base = io_map_base - size;
797  
798  	/*
799  	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
800  	 * allocating the new area, as it would indicate we've
801  	 * overflowed the idmap/IO address range.
802  	 */
803  	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
804  		ret = -ENOMEM;
805  	else
806  		io_map_base = base;
807  
808  	mutex_unlock(&kvm_hyp_pgd_mutex);
809  
810  	if (ret)
811  		goto out;
812  
813  	if (__kvm_cpu_uses_extended_idmap())
814  		pgd = boot_hyp_pgd;
815  
816  	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
817  				    base, base + size,
818  				    __phys_to_pfn(phys_addr), prot);
819  	if (ret)
820  		goto out;
821  
822  	*haddr = base + offset_in_page(phys_addr);
823  
824  out:
825  	return ret;
826  }
827  
828  /**
829   * create_hyp_io_mappings - Map IO into both kernel and HYP
830   * @phys_addr:	The physical start address which gets mapped
831   * @size:	Size of the region being mapped
832   * @kaddr:	Kernel VA for this mapping
833   * @haddr:	HYP VA for this mapping
834   */
create_hyp_io_mappings(phys_addr_t phys_addr,size_t size,void __iomem ** kaddr,void __iomem ** haddr)835  int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
836  			   void __iomem **kaddr,
837  			   void __iomem **haddr)
838  {
839  	unsigned long addr;
840  	int ret;
841  
842  	*kaddr = ioremap(phys_addr, size);
843  	if (!*kaddr)
844  		return -ENOMEM;
845  
846  	if (is_kernel_in_hyp_mode()) {
847  		*haddr = *kaddr;
848  		return 0;
849  	}
850  
851  	ret = __create_hyp_private_mapping(phys_addr, size,
852  					   &addr, PAGE_HYP_DEVICE);
853  	if (ret) {
854  		iounmap(*kaddr);
855  		*kaddr = NULL;
856  		*haddr = NULL;
857  		return ret;
858  	}
859  
860  	*haddr = (void __iomem *)addr;
861  	return 0;
862  }
863  
864  /**
865   * create_hyp_exec_mappings - Map an executable range into HYP
866   * @phys_addr:	The physical start address which gets mapped
867   * @size:	Size of the region being mapped
868   * @haddr:	HYP VA for this mapping
869   */
create_hyp_exec_mappings(phys_addr_t phys_addr,size_t size,void ** haddr)870  int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
871  			     void **haddr)
872  {
873  	unsigned long addr;
874  	int ret;
875  
876  	BUG_ON(is_kernel_in_hyp_mode());
877  
878  	ret = __create_hyp_private_mapping(phys_addr, size,
879  					   &addr, PAGE_HYP_EXEC);
880  	if (ret) {
881  		*haddr = NULL;
882  		return ret;
883  	}
884  
885  	*haddr = (void *)addr;
886  	return 0;
887  }
888  
889  /**
890   * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
891   * @kvm:	The KVM struct pointer for the VM.
892   *
893   * Allocates only the stage-2 HW PGD level table(s) of size defined by
894   * stage2_pgd_size(kvm).
895   *
896   * Note we don't need locking here as this is only called when the VM is
897   * created, which can only be done once.
898   */
kvm_alloc_stage2_pgd(struct kvm * kvm)899  int kvm_alloc_stage2_pgd(struct kvm *kvm)
900  {
901  	phys_addr_t pgd_phys;
902  	pgd_t *pgd;
903  
904  	if (kvm->arch.pgd != NULL) {
905  		kvm_err("kvm_arch already initialized?\n");
906  		return -EINVAL;
907  	}
908  
909  	/* Allocate the HW PGD, making sure that each page gets its own refcount */
910  	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
911  	if (!pgd)
912  		return -ENOMEM;
913  
914  	pgd_phys = virt_to_phys(pgd);
915  	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
916  		return -EINVAL;
917  
918  	kvm->arch.pgd = pgd;
919  	kvm->arch.pgd_phys = pgd_phys;
920  	return 0;
921  }
922  
stage2_unmap_memslot(struct kvm * kvm,struct kvm_memory_slot * memslot)923  static void stage2_unmap_memslot(struct kvm *kvm,
924  				 struct kvm_memory_slot *memslot)
925  {
926  	hva_t hva = memslot->userspace_addr;
927  	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
928  	phys_addr_t size = PAGE_SIZE * memslot->npages;
929  	hva_t reg_end = hva + size;
930  
931  	/*
932  	 * A memory region could potentially cover multiple VMAs, and any holes
933  	 * between them, so iterate over all of them to find out if we should
934  	 * unmap any of them.
935  	 *
936  	 *     +--------------------------------------------+
937  	 * +---------------+----------------+   +----------------+
938  	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
939  	 * +---------------+----------------+   +----------------+
940  	 *     |               memory region                |
941  	 *     +--------------------------------------------+
942  	 */
943  	do {
944  		struct vm_area_struct *vma = find_vma(current->mm, hva);
945  		hva_t vm_start, vm_end;
946  
947  		if (!vma || vma->vm_start >= reg_end)
948  			break;
949  
950  		/*
951  		 * Take the intersection of this VMA with the memory region
952  		 */
953  		vm_start = max(hva, vma->vm_start);
954  		vm_end = min(reg_end, vma->vm_end);
955  
956  		if (!(vma->vm_flags & VM_PFNMAP)) {
957  			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
958  			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
959  		}
960  		hva = vm_end;
961  	} while (hva < reg_end);
962  }
963  
964  /**
965   * stage2_unmap_vm - Unmap Stage-2 RAM mappings
966   * @kvm: The struct kvm pointer
967   *
968   * Go through the memregions and unmap any reguler RAM
969   * backing memory already mapped to the VM.
970   */
stage2_unmap_vm(struct kvm * kvm)971  void stage2_unmap_vm(struct kvm *kvm)
972  {
973  	struct kvm_memslots *slots;
974  	struct kvm_memory_slot *memslot;
975  	int idx;
976  
977  	idx = srcu_read_lock(&kvm->srcu);
978  	down_read(&current->mm->mmap_sem);
979  	spin_lock(&kvm->mmu_lock);
980  
981  	slots = kvm_memslots(kvm);
982  	kvm_for_each_memslot(memslot, slots)
983  		stage2_unmap_memslot(kvm, memslot);
984  
985  	spin_unlock(&kvm->mmu_lock);
986  	up_read(&current->mm->mmap_sem);
987  	srcu_read_unlock(&kvm->srcu, idx);
988  }
989  
990  /**
991   * kvm_free_stage2_pgd - free all stage-2 tables
992   * @kvm:	The KVM struct pointer for the VM.
993   *
994   * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
995   * underlying level-2 and level-3 tables before freeing the actual level-1 table
996   * and setting the struct pointer to NULL.
997   */
kvm_free_stage2_pgd(struct kvm * kvm)998  void kvm_free_stage2_pgd(struct kvm *kvm)
999  {
1000  	void *pgd = NULL;
1001  
1002  	spin_lock(&kvm->mmu_lock);
1003  	if (kvm->arch.pgd) {
1004  		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
1005  		pgd = READ_ONCE(kvm->arch.pgd);
1006  		kvm->arch.pgd = NULL;
1007  		kvm->arch.pgd_phys = 0;
1008  	}
1009  	spin_unlock(&kvm->mmu_lock);
1010  
1011  	/* Free the HW pgd, one page at a time */
1012  	if (pgd)
1013  		free_pages_exact(pgd, stage2_pgd_size(kvm));
1014  }
1015  
stage2_get_pud(struct kvm * kvm,struct kvm_mmu_memory_cache * cache,phys_addr_t addr)1016  static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1017  			     phys_addr_t addr)
1018  {
1019  	pgd_t *pgd;
1020  	pud_t *pud;
1021  
1022  	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1023  	if (stage2_pgd_none(kvm, *pgd)) {
1024  		if (!cache)
1025  			return NULL;
1026  		pud = mmu_memory_cache_alloc(cache);
1027  		stage2_pgd_populate(kvm, pgd, pud);
1028  		get_page(virt_to_page(pgd));
1029  	}
1030  
1031  	return stage2_pud_offset(kvm, pgd, addr);
1032  }
1033  
stage2_get_pmd(struct kvm * kvm,struct kvm_mmu_memory_cache * cache,phys_addr_t addr)1034  static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1035  			     phys_addr_t addr)
1036  {
1037  	pud_t *pud;
1038  	pmd_t *pmd;
1039  
1040  	pud = stage2_get_pud(kvm, cache, addr);
1041  	if (!pud || stage2_pud_huge(kvm, *pud))
1042  		return NULL;
1043  
1044  	if (stage2_pud_none(kvm, *pud)) {
1045  		if (!cache)
1046  			return NULL;
1047  		pmd = mmu_memory_cache_alloc(cache);
1048  		stage2_pud_populate(kvm, pud, pmd);
1049  		get_page(virt_to_page(pud));
1050  	}
1051  
1052  	return stage2_pmd_offset(kvm, pud, addr);
1053  }
1054  
stage2_set_pmd_huge(struct kvm * kvm,struct kvm_mmu_memory_cache * cache,phys_addr_t addr,const pmd_t * new_pmd)1055  static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
1056  			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
1057  {
1058  	pmd_t *pmd, old_pmd;
1059  
1060  retry:
1061  	pmd = stage2_get_pmd(kvm, cache, addr);
1062  	VM_BUG_ON(!pmd);
1063  
1064  	old_pmd = *pmd;
1065  	/*
1066  	 * Multiple vcpus faulting on the same PMD entry, can
1067  	 * lead to them sequentially updating the PMD with the
1068  	 * same value. Following the break-before-make
1069  	 * (pmd_clear() followed by tlb_flush()) process can
1070  	 * hinder forward progress due to refaults generated
1071  	 * on missing translations.
1072  	 *
1073  	 * Skip updating the page table if the entry is
1074  	 * unchanged.
1075  	 */
1076  	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
1077  		return 0;
1078  
1079  	if (pmd_present(old_pmd)) {
1080  		/*
1081  		 * If we already have PTE level mapping for this block,
1082  		 * we must unmap it to avoid inconsistent TLB state and
1083  		 * leaking the table page. We could end up in this situation
1084  		 * if the memory slot was marked for dirty logging and was
1085  		 * reverted, leaving PTE level mappings for the pages accessed
1086  		 * during the period. So, unmap the PTE level mapping for this
1087  		 * block and retry, as we could have released the upper level
1088  		 * table in the process.
1089  		 *
1090  		 * Normal THP split/merge follows mmu_notifier callbacks and do
1091  		 * get handled accordingly.
1092  		 */
1093  		if (!pmd_thp_or_huge(old_pmd)) {
1094  			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
1095  			goto retry;
1096  		}
1097  		/*
1098  		 * Mapping in huge pages should only happen through a
1099  		 * fault.  If a page is merged into a transparent huge
1100  		 * page, the individual subpages of that huge page
1101  		 * should be unmapped through MMU notifiers before we
1102  		 * get here.
1103  		 *
1104  		 * Merging of CompoundPages is not supported; they
1105  		 * should become splitting first, unmapped, merged,
1106  		 * and mapped back in on-demand.
1107  		 */
1108  		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
1109  		pmd_clear(pmd);
1110  		kvm_tlb_flush_vmid_ipa(kvm, addr);
1111  	} else {
1112  		get_page(virt_to_page(pmd));
1113  	}
1114  
1115  	kvm_set_pmd(pmd, *new_pmd);
1116  	return 0;
1117  }
1118  
stage2_set_pud_huge(struct kvm * kvm,struct kvm_mmu_memory_cache * cache,phys_addr_t addr,const pud_t * new_pudp)1119  static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1120  			       phys_addr_t addr, const pud_t *new_pudp)
1121  {
1122  	pud_t *pudp, old_pud;
1123  
1124  retry:
1125  	pudp = stage2_get_pud(kvm, cache, addr);
1126  	VM_BUG_ON(!pudp);
1127  
1128  	old_pud = *pudp;
1129  
1130  	/*
1131  	 * A large number of vcpus faulting on the same stage 2 entry,
1132  	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
1133  	 * Skip updating the page tables if there is no change.
1134  	 */
1135  	if (pud_val(old_pud) == pud_val(*new_pudp))
1136  		return 0;
1137  
1138  	if (stage2_pud_present(kvm, old_pud)) {
1139  		/*
1140  		 * If we already have table level mapping for this block, unmap
1141  		 * the range for this block and retry.
1142  		 */
1143  		if (!stage2_pud_huge(kvm, old_pud)) {
1144  			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
1145  			goto retry;
1146  		}
1147  
1148  		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
1149  		stage2_pud_clear(kvm, pudp);
1150  		kvm_tlb_flush_vmid_ipa(kvm, addr);
1151  	} else {
1152  		get_page(virt_to_page(pudp));
1153  	}
1154  
1155  	kvm_set_pud(pudp, *new_pudp);
1156  	return 0;
1157  }
1158  
1159  /*
1160   * stage2_get_leaf_entry - walk the stage2 VM page tables and return
1161   * true if a valid and present leaf-entry is found. A pointer to the
1162   * leaf-entry is returned in the appropriate level variable - pudpp,
1163   * pmdpp, ptepp.
1164   */
stage2_get_leaf_entry(struct kvm * kvm,phys_addr_t addr,pud_t ** pudpp,pmd_t ** pmdpp,pte_t ** ptepp)1165  static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
1166  				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
1167  {
1168  	pud_t *pudp;
1169  	pmd_t *pmdp;
1170  	pte_t *ptep;
1171  
1172  	*pudpp = NULL;
1173  	*pmdpp = NULL;
1174  	*ptepp = NULL;
1175  
1176  	pudp = stage2_get_pud(kvm, NULL, addr);
1177  	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1178  		return false;
1179  
1180  	if (stage2_pud_huge(kvm, *pudp)) {
1181  		*pudpp = pudp;
1182  		return true;
1183  	}
1184  
1185  	pmdp = stage2_pmd_offset(kvm, pudp, addr);
1186  	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1187  		return false;
1188  
1189  	if (pmd_thp_or_huge(*pmdp)) {
1190  		*pmdpp = pmdp;
1191  		return true;
1192  	}
1193  
1194  	ptep = pte_offset_kernel(pmdp, addr);
1195  	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1196  		return false;
1197  
1198  	*ptepp = ptep;
1199  	return true;
1200  }
1201  
stage2_is_exec(struct kvm * kvm,phys_addr_t addr)1202  static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
1203  {
1204  	pud_t *pudp;
1205  	pmd_t *pmdp;
1206  	pte_t *ptep;
1207  	bool found;
1208  
1209  	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
1210  	if (!found)
1211  		return false;
1212  
1213  	if (pudp)
1214  		return kvm_s2pud_exec(pudp);
1215  	else if (pmdp)
1216  		return kvm_s2pmd_exec(pmdp);
1217  	else
1218  		return kvm_s2pte_exec(ptep);
1219  }
1220  
stage2_set_pte(struct kvm * kvm,struct kvm_mmu_memory_cache * cache,phys_addr_t addr,const pte_t * new_pte,unsigned long flags)1221  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
1222  			  phys_addr_t addr, const pte_t *new_pte,
1223  			  unsigned long flags)
1224  {
1225  	pud_t *pud;
1226  	pmd_t *pmd;
1227  	pte_t *pte, old_pte;
1228  	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1229  	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1230  
1231  	VM_BUG_ON(logging_active && !cache);
1232  
1233  	/* Create stage-2 page table mapping - Levels 0 and 1 */
1234  	pud = stage2_get_pud(kvm, cache, addr);
1235  	if (!pud) {
1236  		/*
1237  		 * Ignore calls from kvm_set_spte_hva for unallocated
1238  		 * address ranges.
1239  		 */
1240  		return 0;
1241  	}
1242  
1243  	/*
1244  	 * While dirty page logging - dissolve huge PUD, then continue
1245  	 * on to allocate page.
1246  	 */
1247  	if (logging_active)
1248  		stage2_dissolve_pud(kvm, addr, pud);
1249  
1250  	if (stage2_pud_none(kvm, *pud)) {
1251  		if (!cache)
1252  			return 0; /* ignore calls from kvm_set_spte_hva */
1253  		pmd = mmu_memory_cache_alloc(cache);
1254  		stage2_pud_populate(kvm, pud, pmd);
1255  		get_page(virt_to_page(pud));
1256  	}
1257  
1258  	pmd = stage2_pmd_offset(kvm, pud, addr);
1259  	if (!pmd) {
1260  		/*
1261  		 * Ignore calls from kvm_set_spte_hva for unallocated
1262  		 * address ranges.
1263  		 */
1264  		return 0;
1265  	}
1266  
1267  	/*
1268  	 * While dirty page logging - dissolve huge PMD, then continue on to
1269  	 * allocate page.
1270  	 */
1271  	if (logging_active)
1272  		stage2_dissolve_pmd(kvm, addr, pmd);
1273  
1274  	/* Create stage-2 page mappings - Level 2 */
1275  	if (pmd_none(*pmd)) {
1276  		if (!cache)
1277  			return 0; /* ignore calls from kvm_set_spte_hva */
1278  		pte = mmu_memory_cache_alloc(cache);
1279  		kvm_pmd_populate(pmd, pte);
1280  		get_page(virt_to_page(pmd));
1281  	}
1282  
1283  	pte = pte_offset_kernel(pmd, addr);
1284  
1285  	if (iomap && pte_present(*pte))
1286  		return -EFAULT;
1287  
1288  	/* Create 2nd stage page table mapping - Level 3 */
1289  	old_pte = *pte;
1290  	if (pte_present(old_pte)) {
1291  		/* Skip page table update if there is no change */
1292  		if (pte_val(old_pte) == pte_val(*new_pte))
1293  			return 0;
1294  
1295  		kvm_set_pte(pte, __pte(0));
1296  		kvm_tlb_flush_vmid_ipa(kvm, addr);
1297  	} else {
1298  		get_page(virt_to_page(pte));
1299  	}
1300  
1301  	kvm_set_pte(pte, *new_pte);
1302  	return 0;
1303  }
1304  
1305  #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
stage2_ptep_test_and_clear_young(pte_t * pte)1306  static int stage2_ptep_test_and_clear_young(pte_t *pte)
1307  {
1308  	if (pte_young(*pte)) {
1309  		*pte = pte_mkold(*pte);
1310  		return 1;
1311  	}
1312  	return 0;
1313  }
1314  #else
stage2_ptep_test_and_clear_young(pte_t * pte)1315  static int stage2_ptep_test_and_clear_young(pte_t *pte)
1316  {
1317  	return __ptep_test_and_clear_young(pte);
1318  }
1319  #endif
1320  
stage2_pmdp_test_and_clear_young(pmd_t * pmd)1321  static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1322  {
1323  	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1324  }
1325  
stage2_pudp_test_and_clear_young(pud_t * pud)1326  static int stage2_pudp_test_and_clear_young(pud_t *pud)
1327  {
1328  	return stage2_ptep_test_and_clear_young((pte_t *)pud);
1329  }
1330  
1331  /**
1332   * kvm_phys_addr_ioremap - map a device range to guest IPA
1333   *
1334   * @kvm:	The KVM pointer
1335   * @guest_ipa:	The IPA at which to insert the mapping
1336   * @pa:		The physical address of the device
1337   * @size:	The size of the mapping
1338   */
kvm_phys_addr_ioremap(struct kvm * kvm,phys_addr_t guest_ipa,phys_addr_t pa,unsigned long size,bool writable)1339  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1340  			  phys_addr_t pa, unsigned long size, bool writable)
1341  {
1342  	phys_addr_t addr, end;
1343  	int ret = 0;
1344  	unsigned long pfn;
1345  	struct kvm_mmu_memory_cache cache = { 0, };
1346  
1347  	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1348  	pfn = __phys_to_pfn(pa);
1349  
1350  	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
1351  		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
1352  
1353  		if (writable)
1354  			pte = kvm_s2pte_mkwrite(pte);
1355  
1356  		ret = mmu_topup_memory_cache(&cache,
1357  					     kvm_mmu_cache_min_pages(kvm),
1358  					     KVM_NR_MEM_OBJS);
1359  		if (ret)
1360  			goto out;
1361  		spin_lock(&kvm->mmu_lock);
1362  		ret = stage2_set_pte(kvm, &cache, addr, &pte,
1363  						KVM_S2PTE_FLAG_IS_IOMAP);
1364  		spin_unlock(&kvm->mmu_lock);
1365  		if (ret)
1366  			goto out;
1367  
1368  		pfn++;
1369  	}
1370  
1371  out:
1372  	mmu_free_memory_cache(&cache);
1373  	return ret;
1374  }
1375  
transparent_hugepage_adjust(kvm_pfn_t * pfnp,phys_addr_t * ipap)1376  static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
1377  {
1378  	kvm_pfn_t pfn = *pfnp;
1379  	gfn_t gfn = *ipap >> PAGE_SHIFT;
1380  	struct page *page = pfn_to_page(pfn);
1381  
1382  	/*
1383  	 * PageTransCompoundMap() returns true for THP and
1384  	 * hugetlbfs. Make sure the adjustment is done only for THP
1385  	 * pages.
1386  	 */
1387  	if (!PageHuge(page) && PageTransCompoundMap(page)) {
1388  		unsigned long mask;
1389  		/*
1390  		 * The address we faulted on is backed by a transparent huge
1391  		 * page.  However, because we map the compound huge page and
1392  		 * not the individual tail page, we need to transfer the
1393  		 * refcount to the head page.  We have to be careful that the
1394  		 * THP doesn't start to split while we are adjusting the
1395  		 * refcounts.
1396  		 *
1397  		 * We are sure this doesn't happen, because mmu_notifier_retry
1398  		 * was successful and we are holding the mmu_lock, so if this
1399  		 * THP is trying to split, it will be blocked in the mmu
1400  		 * notifier before touching any of the pages, specifically
1401  		 * before being able to call __split_huge_page_refcount().
1402  		 *
1403  		 * We can therefore safely transfer the refcount from PG_tail
1404  		 * to PG_head and switch the pfn from a tail page to the head
1405  		 * page accordingly.
1406  		 */
1407  		mask = PTRS_PER_PMD - 1;
1408  		VM_BUG_ON((gfn & mask) != (pfn & mask));
1409  		if (pfn & mask) {
1410  			*ipap &= PMD_MASK;
1411  			kvm_release_pfn_clean(pfn);
1412  			pfn &= ~mask;
1413  			kvm_get_pfn(pfn);
1414  			*pfnp = pfn;
1415  		}
1416  
1417  		return true;
1418  	}
1419  
1420  	return false;
1421  }
1422  
1423  /**
1424   * stage2_wp_ptes - write protect PMD range
1425   * @pmd:	pointer to pmd entry
1426   * @addr:	range start address
1427   * @end:	range end address
1428   */
stage2_wp_ptes(pmd_t * pmd,phys_addr_t addr,phys_addr_t end)1429  static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1430  {
1431  	pte_t *pte;
1432  
1433  	pte = pte_offset_kernel(pmd, addr);
1434  	do {
1435  		if (!pte_none(*pte)) {
1436  			if (!kvm_s2pte_readonly(pte))
1437  				kvm_set_s2pte_readonly(pte);
1438  		}
1439  	} while (pte++, addr += PAGE_SIZE, addr != end);
1440  }
1441  
1442  /**
1443   * stage2_wp_pmds - write protect PUD range
1444   * kvm:		kvm instance for the VM
1445   * @pud:	pointer to pud entry
1446   * @addr:	range start address
1447   * @end:	range end address
1448   */
stage2_wp_pmds(struct kvm * kvm,pud_t * pud,phys_addr_t addr,phys_addr_t end)1449  static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
1450  			   phys_addr_t addr, phys_addr_t end)
1451  {
1452  	pmd_t *pmd;
1453  	phys_addr_t next;
1454  
1455  	pmd = stage2_pmd_offset(kvm, pud, addr);
1456  
1457  	do {
1458  		next = stage2_pmd_addr_end(kvm, addr, end);
1459  		if (!pmd_none(*pmd)) {
1460  			if (pmd_thp_or_huge(*pmd)) {
1461  				if (!kvm_s2pmd_readonly(pmd))
1462  					kvm_set_s2pmd_readonly(pmd);
1463  			} else {
1464  				stage2_wp_ptes(pmd, addr, next);
1465  			}
1466  		}
1467  	} while (pmd++, addr = next, addr != end);
1468  }
1469  
1470  /**
1471   * stage2_wp_puds - write protect PGD range
1472   * @pgd:	pointer to pgd entry
1473   * @addr:	range start address
1474   * @end:	range end address
1475   */
stage2_wp_puds(struct kvm * kvm,pgd_t * pgd,phys_addr_t addr,phys_addr_t end)1476  static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
1477  			    phys_addr_t addr, phys_addr_t end)
1478  {
1479  	pud_t *pud;
1480  	phys_addr_t next;
1481  
1482  	pud = stage2_pud_offset(kvm, pgd, addr);
1483  	do {
1484  		next = stage2_pud_addr_end(kvm, addr, end);
1485  		if (!stage2_pud_none(kvm, *pud)) {
1486  			if (stage2_pud_huge(kvm, *pud)) {
1487  				if (!kvm_s2pud_readonly(pud))
1488  					kvm_set_s2pud_readonly(pud);
1489  			} else {
1490  				stage2_wp_pmds(kvm, pud, addr, next);
1491  			}
1492  		}
1493  	} while (pud++, addr = next, addr != end);
1494  }
1495  
1496  /**
1497   * stage2_wp_range() - write protect stage2 memory region range
1498   * @kvm:	The KVM pointer
1499   * @addr:	Start address of range
1500   * @end:	End address of range
1501   */
stage2_wp_range(struct kvm * kvm,phys_addr_t addr,phys_addr_t end)1502  static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1503  {
1504  	pgd_t *pgd;
1505  	phys_addr_t next;
1506  
1507  	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
1508  	do {
1509  		/*
1510  		 * Release kvm_mmu_lock periodically if the memory region is
1511  		 * large. Otherwise, we may see kernel panics with
1512  		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1513  		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1514  		 * will also starve other vCPUs. We have to also make sure
1515  		 * that the page tables are not freed while we released
1516  		 * the lock.
1517  		 */
1518  		cond_resched_lock(&kvm->mmu_lock);
1519  		if (!READ_ONCE(kvm->arch.pgd))
1520  			break;
1521  		next = stage2_pgd_addr_end(kvm, addr, end);
1522  		if (stage2_pgd_present(kvm, *pgd))
1523  			stage2_wp_puds(kvm, pgd, addr, next);
1524  	} while (pgd++, addr = next, addr != end);
1525  }
1526  
1527  /**
1528   * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1529   * @kvm:	The KVM pointer
1530   * @slot:	The memory slot to write protect
1531   *
1532   * Called to start logging dirty pages after memory region
1533   * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1534   * all present PUD, PMD and PTEs are write protected in the memory region.
1535   * Afterwards read of dirty page log can be called.
1536   *
1537   * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1538   * serializing operations for VM memory regions.
1539   */
kvm_mmu_wp_memory_region(struct kvm * kvm,int slot)1540  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1541  {
1542  	struct kvm_memslots *slots = kvm_memslots(kvm);
1543  	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1544  	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1545  	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1546  
1547  	spin_lock(&kvm->mmu_lock);
1548  	stage2_wp_range(kvm, start, end);
1549  	spin_unlock(&kvm->mmu_lock);
1550  	kvm_flush_remote_tlbs(kvm);
1551  }
1552  
1553  /**
1554   * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1555   * @kvm:	The KVM pointer
1556   * @slot:	The memory slot associated with mask
1557   * @gfn_offset:	The gfn offset in memory slot
1558   * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
1559   *		slot to be write protected
1560   *
1561   * Walks bits set in mask write protects the associated pte's. Caller must
1562   * acquire kvm_mmu_lock.
1563   */
kvm_mmu_write_protect_pt_masked(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn_offset,unsigned long mask)1564  static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1565  		struct kvm_memory_slot *slot,
1566  		gfn_t gfn_offset, unsigned long mask)
1567  {
1568  	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1569  	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1570  	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1571  
1572  	stage2_wp_range(kvm, start, end);
1573  }
1574  
1575  /*
1576   * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1577   * dirty pages.
1578   *
1579   * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1580   * enable dirty logging for them.
1581   */
kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn_offset,unsigned long mask)1582  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1583  		struct kvm_memory_slot *slot,
1584  		gfn_t gfn_offset, unsigned long mask)
1585  {
1586  	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1587  }
1588  
clean_dcache_guest_page(kvm_pfn_t pfn,unsigned long size)1589  static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1590  {
1591  	__clean_dcache_guest_page(pfn, size);
1592  }
1593  
invalidate_icache_guest_page(kvm_pfn_t pfn,unsigned long size)1594  static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1595  {
1596  	__invalidate_icache_guest_page(pfn, size);
1597  }
1598  
kvm_send_hwpoison_signal(unsigned long address,struct vm_area_struct * vma)1599  static void kvm_send_hwpoison_signal(unsigned long address,
1600  				     struct vm_area_struct *vma)
1601  {
1602  	short lsb;
1603  
1604  	if (is_vm_hugetlb_page(vma))
1605  		lsb = huge_page_shift(hstate_vma(vma));
1606  	else
1607  		lsb = PAGE_SHIFT;
1608  
1609  	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1610  }
1611  
fault_supports_stage2_huge_mapping(struct kvm_memory_slot * memslot,unsigned long hva,unsigned long map_size)1612  static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1613  					       unsigned long hva,
1614  					       unsigned long map_size)
1615  {
1616  	gpa_t gpa_start;
1617  	hva_t uaddr_start, uaddr_end;
1618  	size_t size;
1619  
1620  	size = memslot->npages * PAGE_SIZE;
1621  
1622  	gpa_start = memslot->base_gfn << PAGE_SHIFT;
1623  
1624  	uaddr_start = memslot->userspace_addr;
1625  	uaddr_end = uaddr_start + size;
1626  
1627  	/*
1628  	 * Pages belonging to memslots that don't have the same alignment
1629  	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1630  	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1631  	 *
1632  	 * Consider a layout like the following:
1633  	 *
1634  	 *    memslot->userspace_addr:
1635  	 *    +-----+--------------------+--------------------+---+
1636  	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1637  	 *    +-----+--------------------+--------------------+---+
1638  	 *
1639  	 *    memslot->base_gfn << PAGE_SIZE:
1640  	 *      +---+--------------------+--------------------+-----+
1641  	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1642  	 *      +---+--------------------+--------------------+-----+
1643  	 *
1644  	 * If we create those stage-2 blocks, we'll end up with this incorrect
1645  	 * mapping:
1646  	 *   d -> f
1647  	 *   e -> g
1648  	 *   f -> h
1649  	 */
1650  	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1651  		return false;
1652  
1653  	/*
1654  	 * Next, let's make sure we're not trying to map anything not covered
1655  	 * by the memslot. This means we have to prohibit block size mappings
1656  	 * for the beginning and end of a non-block aligned and non-block sized
1657  	 * memory slot (illustrated by the head and tail parts of the
1658  	 * userspace view above containing pages 'abcde' and 'xyz',
1659  	 * respectively).
1660  	 *
1661  	 * Note that it doesn't matter if we do the check using the
1662  	 * userspace_addr or the base_gfn, as both are equally aligned (per
1663  	 * the check above) and equally sized.
1664  	 */
1665  	return (hva & ~(map_size - 1)) >= uaddr_start &&
1666  	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1667  }
1668  
user_mem_abort(struct kvm_vcpu * vcpu,phys_addr_t fault_ipa,struct kvm_memory_slot * memslot,unsigned long hva,unsigned long fault_status)1669  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1670  			  struct kvm_memory_slot *memslot, unsigned long hva,
1671  			  unsigned long fault_status)
1672  {
1673  	int ret;
1674  	bool write_fault, writable, force_pte = false;
1675  	bool exec_fault, needs_exec;
1676  	unsigned long mmu_seq;
1677  	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1678  	struct kvm *kvm = vcpu->kvm;
1679  	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1680  	struct vm_area_struct *vma;
1681  	kvm_pfn_t pfn;
1682  	pgprot_t mem_type = PAGE_S2;
1683  	bool logging_active = memslot_is_logging(memslot);
1684  	unsigned long vma_pagesize, flags = 0;
1685  
1686  	write_fault = kvm_is_write_fault(vcpu);
1687  	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1688  	VM_BUG_ON(write_fault && exec_fault);
1689  
1690  	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1691  		kvm_err("Unexpected L2 read permission error\n");
1692  		return -EFAULT;
1693  	}
1694  
1695  	/* Let's check if we will get back a huge page backed by hugetlbfs */
1696  	down_read(&current->mm->mmap_sem);
1697  	vma = find_vma_intersection(current->mm, hva, hva + 1);
1698  	if (unlikely(!vma)) {
1699  		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1700  		up_read(&current->mm->mmap_sem);
1701  		return -EFAULT;
1702  	}
1703  
1704  	vma_pagesize = vma_kernel_pagesize(vma);
1705  	if (logging_active ||
1706  	    (vma->vm_flags & VM_PFNMAP) ||
1707  	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1708  		force_pte = true;
1709  		vma_pagesize = PAGE_SIZE;
1710  	}
1711  
1712  	/*
1713  	 * The stage2 has a minimum of 2 level table (For arm64 see
1714  	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1715  	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
1716  	 * As for PUD huge maps, we must make sure that we have at least
1717  	 * 3 levels, i.e, PMD is not folded.
1718  	 */
1719  	if (vma_pagesize == PMD_SIZE ||
1720  	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1721  		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1722  	up_read(&current->mm->mmap_sem);
1723  
1724  	/* We need minimum second+third level pages */
1725  	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
1726  				     KVM_NR_MEM_OBJS);
1727  	if (ret)
1728  		return ret;
1729  
1730  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
1731  	/*
1732  	 * Ensure the read of mmu_notifier_seq happens before we call
1733  	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1734  	 * the page we just got a reference to gets unmapped before we have a
1735  	 * chance to grab the mmu_lock, which ensure that if the page gets
1736  	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1737  	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1738  	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1739  	 */
1740  	smp_rmb();
1741  
1742  	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1743  	if (pfn == KVM_PFN_ERR_HWPOISON) {
1744  		kvm_send_hwpoison_signal(hva, vma);
1745  		return 0;
1746  	}
1747  	if (is_error_noslot_pfn(pfn))
1748  		return -EFAULT;
1749  
1750  	if (kvm_is_device_pfn(pfn)) {
1751  		mem_type = PAGE_S2_DEVICE;
1752  		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1753  	} else if (logging_active) {
1754  		/*
1755  		 * Faults on pages in a memslot with logging enabled
1756  		 * should not be mapped with huge pages (it introduces churn
1757  		 * and performance degradation), so force a pte mapping.
1758  		 */
1759  		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1760  
1761  		/*
1762  		 * Only actually map the page as writable if this was a write
1763  		 * fault.
1764  		 */
1765  		if (!write_fault)
1766  			writable = false;
1767  	}
1768  
1769  	if (exec_fault && is_iomap(flags))
1770  		return -ENOEXEC;
1771  
1772  	spin_lock(&kvm->mmu_lock);
1773  	if (mmu_notifier_retry(kvm, mmu_seq))
1774  		goto out_unlock;
1775  
1776  	if (vma_pagesize == PAGE_SIZE && !force_pte) {
1777  		/*
1778  		 * Only PMD_SIZE transparent hugepages(THP) are
1779  		 * currently supported. This code will need to be
1780  		 * updated to support other THP sizes.
1781  		 *
1782  		 * Make sure the host VA and the guest IPA are sufficiently
1783  		 * aligned and that the block is contained within the memslot.
1784  		 */
1785  		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
1786  		    transparent_hugepage_adjust(&pfn, &fault_ipa))
1787  			vma_pagesize = PMD_SIZE;
1788  	}
1789  
1790  	if (writable)
1791  		kvm_set_pfn_dirty(pfn);
1792  
1793  	if (fault_status != FSC_PERM && !is_iomap(flags))
1794  		clean_dcache_guest_page(pfn, vma_pagesize);
1795  
1796  	if (exec_fault)
1797  		invalidate_icache_guest_page(pfn, vma_pagesize);
1798  
1799  	/*
1800  	 * If we took an execution fault we have made the
1801  	 * icache/dcache coherent above and should now let the s2
1802  	 * mapping be executable.
1803  	 *
1804  	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1805  	 * execute permissions, and we preserve whatever we have.
1806  	 */
1807  	needs_exec = exec_fault ||
1808  		(fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
1809  
1810  	if (vma_pagesize == PUD_SIZE) {
1811  		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1812  
1813  		new_pud = kvm_pud_mkhuge(new_pud);
1814  		if (writable)
1815  			new_pud = kvm_s2pud_mkwrite(new_pud);
1816  
1817  		if (needs_exec)
1818  			new_pud = kvm_s2pud_mkexec(new_pud);
1819  
1820  		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
1821  	} else if (vma_pagesize == PMD_SIZE) {
1822  		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1823  
1824  		new_pmd = kvm_pmd_mkhuge(new_pmd);
1825  
1826  		if (writable)
1827  			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1828  
1829  		if (needs_exec)
1830  			new_pmd = kvm_s2pmd_mkexec(new_pmd);
1831  
1832  		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1833  	} else {
1834  		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1835  
1836  		if (writable) {
1837  			new_pte = kvm_s2pte_mkwrite(new_pte);
1838  			mark_page_dirty(kvm, gfn);
1839  		}
1840  
1841  		if (needs_exec)
1842  			new_pte = kvm_s2pte_mkexec(new_pte);
1843  
1844  		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1845  	}
1846  
1847  out_unlock:
1848  	spin_unlock(&kvm->mmu_lock);
1849  	kvm_set_pfn_accessed(pfn);
1850  	kvm_release_pfn_clean(pfn);
1851  	return ret;
1852  }
1853  
1854  /*
1855   * Resolve the access fault by making the page young again.
1856   * Note that because the faulting entry is guaranteed not to be
1857   * cached in the TLB, we don't need to invalidate anything.
1858   * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1859   * so there is no need for atomic (pte|pmd)_mkyoung operations.
1860   */
handle_access_fault(struct kvm_vcpu * vcpu,phys_addr_t fault_ipa)1861  static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1862  {
1863  	pud_t *pud;
1864  	pmd_t *pmd;
1865  	pte_t *pte;
1866  	kvm_pfn_t pfn;
1867  	bool pfn_valid = false;
1868  
1869  	trace_kvm_access_fault(fault_ipa);
1870  
1871  	spin_lock(&vcpu->kvm->mmu_lock);
1872  
1873  	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
1874  		goto out;
1875  
1876  	if (pud) {		/* HugeTLB */
1877  		*pud = kvm_s2pud_mkyoung(*pud);
1878  		pfn = kvm_pud_pfn(*pud);
1879  		pfn_valid = true;
1880  	} else	if (pmd) {	/* THP, HugeTLB */
1881  		*pmd = pmd_mkyoung(*pmd);
1882  		pfn = pmd_pfn(*pmd);
1883  		pfn_valid = true;
1884  	} else {
1885  		*pte = pte_mkyoung(*pte);	/* Just a page... */
1886  		pfn = pte_pfn(*pte);
1887  		pfn_valid = true;
1888  	}
1889  
1890  out:
1891  	spin_unlock(&vcpu->kvm->mmu_lock);
1892  	if (pfn_valid)
1893  		kvm_set_pfn_accessed(pfn);
1894  }
1895  
1896  /**
1897   * kvm_handle_guest_abort - handles all 2nd stage aborts
1898   * @vcpu:	the VCPU pointer
1899   * @run:	the kvm_run structure
1900   *
1901   * Any abort that gets to the host is almost guaranteed to be caused by a
1902   * missing second stage translation table entry, which can mean that either the
1903   * guest simply needs more memory and we must allocate an appropriate page or it
1904   * can mean that the guest tried to access I/O memory, which is emulated by user
1905   * space. The distinction is based on the IPA causing the fault and whether this
1906   * memory region has been registered as standard RAM by user space.
1907   */
kvm_handle_guest_abort(struct kvm_vcpu * vcpu,struct kvm_run * run)1908  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1909  {
1910  	unsigned long fault_status;
1911  	phys_addr_t fault_ipa;
1912  	struct kvm_memory_slot *memslot;
1913  	unsigned long hva;
1914  	bool is_iabt, write_fault, writable;
1915  	gfn_t gfn;
1916  	int ret, idx;
1917  
1918  	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1919  
1920  	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1921  	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1922  
1923  	/* Synchronous External Abort? */
1924  	if (kvm_vcpu_dabt_isextabt(vcpu)) {
1925  		/*
1926  		 * For RAS the host kernel may handle this abort.
1927  		 * There is no need to pass the error into the guest.
1928  		 */
1929  		if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
1930  			return 1;
1931  
1932  		if (unlikely(!is_iabt)) {
1933  			kvm_inject_vabt(vcpu);
1934  			return 1;
1935  		}
1936  	}
1937  
1938  	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1939  			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
1940  
1941  	/* Check the stage-2 fault is trans. fault or write fault */
1942  	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1943  	    fault_status != FSC_ACCESS) {
1944  		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1945  			kvm_vcpu_trap_get_class(vcpu),
1946  			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1947  			(unsigned long)kvm_vcpu_get_hsr(vcpu));
1948  		return -EFAULT;
1949  	}
1950  
1951  	idx = srcu_read_lock(&vcpu->kvm->srcu);
1952  
1953  	gfn = fault_ipa >> PAGE_SHIFT;
1954  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
1955  	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1956  	write_fault = kvm_is_write_fault(vcpu);
1957  	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1958  		if (is_iabt) {
1959  			/* Prefetch Abort on I/O address */
1960  			ret = -ENOEXEC;
1961  			goto out;
1962  		}
1963  
1964  		/*
1965  		 * Check for a cache maintenance operation. Since we
1966  		 * ended-up here, we know it is outside of any memory
1967  		 * slot. But we can't find out if that is for a device,
1968  		 * or if the guest is just being stupid. The only thing
1969  		 * we know for sure is that this range cannot be cached.
1970  		 *
1971  		 * So let's assume that the guest is just being
1972  		 * cautious, and skip the instruction.
1973  		 */
1974  		if (kvm_vcpu_dabt_is_cm(vcpu)) {
1975  			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1976  			ret = 1;
1977  			goto out_unlock;
1978  		}
1979  
1980  		/*
1981  		 * The IPA is reported as [MAX:12], so we need to
1982  		 * complement it with the bottom 12 bits from the
1983  		 * faulting VA. This is always 12 bits, irrespective
1984  		 * of the page size.
1985  		 */
1986  		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1987  		ret = io_mem_abort(vcpu, run, fault_ipa);
1988  		goto out_unlock;
1989  	}
1990  
1991  	/* Userspace should not be able to register out-of-bounds IPAs */
1992  	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1993  
1994  	if (fault_status == FSC_ACCESS) {
1995  		handle_access_fault(vcpu, fault_ipa);
1996  		ret = 1;
1997  		goto out_unlock;
1998  	}
1999  
2000  	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
2001  	if (ret == 0)
2002  		ret = 1;
2003  out:
2004  	if (ret == -ENOEXEC) {
2005  		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
2006  		ret = 1;
2007  	}
2008  out_unlock:
2009  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2010  	return ret;
2011  }
2012  
handle_hva_to_gpa(struct kvm * kvm,unsigned long start,unsigned long end,int (* handler)(struct kvm * kvm,gpa_t gpa,u64 size,void * data),void * data)2013  static int handle_hva_to_gpa(struct kvm *kvm,
2014  			     unsigned long start,
2015  			     unsigned long end,
2016  			     int (*handler)(struct kvm *kvm,
2017  					    gpa_t gpa, u64 size,
2018  					    void *data),
2019  			     void *data)
2020  {
2021  	struct kvm_memslots *slots;
2022  	struct kvm_memory_slot *memslot;
2023  	int ret = 0;
2024  
2025  	slots = kvm_memslots(kvm);
2026  
2027  	/* we only care about the pages that the guest sees */
2028  	kvm_for_each_memslot(memslot, slots) {
2029  		unsigned long hva_start, hva_end;
2030  		gfn_t gpa;
2031  
2032  		hva_start = max(start, memslot->userspace_addr);
2033  		hva_end = min(end, memslot->userspace_addr +
2034  					(memslot->npages << PAGE_SHIFT));
2035  		if (hva_start >= hva_end)
2036  			continue;
2037  
2038  		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
2039  		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
2040  	}
2041  
2042  	return ret;
2043  }
2044  
kvm_unmap_hva_handler(struct kvm * kvm,gpa_t gpa,u64 size,void * data)2045  static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2046  {
2047  	unmap_stage2_range(kvm, gpa, size);
2048  	return 0;
2049  }
2050  
kvm_unmap_hva_range(struct kvm * kvm,unsigned long start,unsigned long end)2051  int kvm_unmap_hva_range(struct kvm *kvm,
2052  			unsigned long start, unsigned long end)
2053  {
2054  	if (!kvm->arch.pgd)
2055  		return 0;
2056  
2057  	trace_kvm_unmap_hva_range(start, end);
2058  	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
2059  	return 0;
2060  }
2061  
kvm_set_spte_handler(struct kvm * kvm,gpa_t gpa,u64 size,void * data)2062  static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2063  {
2064  	pte_t *pte = (pte_t *)data;
2065  
2066  	WARN_ON(size != PAGE_SIZE);
2067  	/*
2068  	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
2069  	 * flag clear because MMU notifiers will have unmapped a huge PMD before
2070  	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
2071  	 * therefore stage2_set_pte() never needs to clear out a huge PMD
2072  	 * through this calling path.
2073  	 */
2074  	stage2_set_pte(kvm, NULL, gpa, pte, 0);
2075  	return 0;
2076  }
2077  
2078  
kvm_set_spte_hva(struct kvm * kvm,unsigned long hva,pte_t pte)2079  int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2080  {
2081  	unsigned long end = hva + PAGE_SIZE;
2082  	kvm_pfn_t pfn = pte_pfn(pte);
2083  	pte_t stage2_pte;
2084  
2085  	if (!kvm->arch.pgd)
2086  		return 0;
2087  
2088  	trace_kvm_set_spte_hva(hva);
2089  
2090  	/*
2091  	 * We've moved a page around, probably through CoW, so let's treat it
2092  	 * just like a translation fault and clean the cache to the PoC.
2093  	 */
2094  	clean_dcache_guest_page(pfn, PAGE_SIZE);
2095  	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
2096  	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
2097  
2098  	return 0;
2099  }
2100  
kvm_age_hva_handler(struct kvm * kvm,gpa_t gpa,u64 size,void * data)2101  static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2102  {
2103  	pud_t *pud;
2104  	pmd_t *pmd;
2105  	pte_t *pte;
2106  
2107  	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2108  	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2109  		return 0;
2110  
2111  	if (pud)
2112  		return stage2_pudp_test_and_clear_young(pud);
2113  	else if (pmd)
2114  		return stage2_pmdp_test_and_clear_young(pmd);
2115  	else
2116  		return stage2_ptep_test_and_clear_young(pte);
2117  }
2118  
kvm_test_age_hva_handler(struct kvm * kvm,gpa_t gpa,u64 size,void * data)2119  static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
2120  {
2121  	pud_t *pud;
2122  	pmd_t *pmd;
2123  	pte_t *pte;
2124  
2125  	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
2126  	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
2127  		return 0;
2128  
2129  	if (pud)
2130  		return kvm_s2pud_young(*pud);
2131  	else if (pmd)
2132  		return pmd_young(*pmd);
2133  	else
2134  		return pte_young(*pte);
2135  }
2136  
kvm_age_hva(struct kvm * kvm,unsigned long start,unsigned long end)2137  int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2138  {
2139  	if (!kvm->arch.pgd)
2140  		return 0;
2141  	trace_kvm_age_hva(start, end);
2142  	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2143  }
2144  
kvm_test_age_hva(struct kvm * kvm,unsigned long hva)2145  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2146  {
2147  	if (!kvm->arch.pgd)
2148  		return 0;
2149  	trace_kvm_test_age_hva(hva);
2150  	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
2151  }
2152  
kvm_mmu_free_memory_caches(struct kvm_vcpu * vcpu)2153  void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
2154  {
2155  	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
2156  }
2157  
kvm_mmu_get_httbr(void)2158  phys_addr_t kvm_mmu_get_httbr(void)
2159  {
2160  	if (__kvm_cpu_uses_extended_idmap())
2161  		return virt_to_phys(merged_hyp_pgd);
2162  	else
2163  		return virt_to_phys(hyp_pgd);
2164  }
2165  
kvm_get_idmap_vector(void)2166  phys_addr_t kvm_get_idmap_vector(void)
2167  {
2168  	return hyp_idmap_vector;
2169  }
2170  
kvm_map_idmap_text(pgd_t * pgd)2171  static int kvm_map_idmap_text(pgd_t *pgd)
2172  {
2173  	int err;
2174  
2175  	/* Create the idmap in the boot page tables */
2176  	err = 	__create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
2177  				      hyp_idmap_start, hyp_idmap_end,
2178  				      __phys_to_pfn(hyp_idmap_start),
2179  				      PAGE_HYP_EXEC);
2180  	if (err)
2181  		kvm_err("Failed to idmap %lx-%lx\n",
2182  			hyp_idmap_start, hyp_idmap_end);
2183  
2184  	return err;
2185  }
2186  
kvm_mmu_init(void)2187  int kvm_mmu_init(void)
2188  {
2189  	int err;
2190  
2191  	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
2192  	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2193  	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
2194  	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2195  	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
2196  
2197  	/*
2198  	 * We rely on the linker script to ensure at build time that the HYP
2199  	 * init code does not cross a page boundary.
2200  	 */
2201  	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
2202  
2203  	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2204  	kvm_debug("HYP VA range: %lx:%lx\n",
2205  		  kern_hyp_va(PAGE_OFFSET),
2206  		  kern_hyp_va((unsigned long)high_memory - 1));
2207  
2208  	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2209  	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2210  	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2211  		/*
2212  		 * The idmap page is intersecting with the VA space,
2213  		 * it is not safe to continue further.
2214  		 */
2215  		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2216  		err = -EINVAL;
2217  		goto out;
2218  	}
2219  
2220  	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
2221  	if (!hyp_pgd) {
2222  		kvm_err("Hyp mode PGD not allocated\n");
2223  		err = -ENOMEM;
2224  		goto out;
2225  	}
2226  
2227  	if (__kvm_cpu_uses_extended_idmap()) {
2228  		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
2229  							 hyp_pgd_order);
2230  		if (!boot_hyp_pgd) {
2231  			kvm_err("Hyp boot PGD not allocated\n");
2232  			err = -ENOMEM;
2233  			goto out;
2234  		}
2235  
2236  		err = kvm_map_idmap_text(boot_hyp_pgd);
2237  		if (err)
2238  			goto out;
2239  
2240  		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
2241  		if (!merged_hyp_pgd) {
2242  			kvm_err("Failed to allocate extra HYP pgd\n");
2243  			goto out;
2244  		}
2245  		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
2246  				    hyp_idmap_start);
2247  	} else {
2248  		err = kvm_map_idmap_text(hyp_pgd);
2249  		if (err)
2250  			goto out;
2251  	}
2252  
2253  	io_map_base = hyp_idmap_start;
2254  	return 0;
2255  out:
2256  	free_hyp_pgds();
2257  	return err;
2258  }
2259  
kvm_arch_commit_memory_region(struct kvm * kvm,const struct kvm_userspace_memory_region * mem,const struct kvm_memory_slot * old,const struct kvm_memory_slot * new,enum kvm_mr_change change)2260  void kvm_arch_commit_memory_region(struct kvm *kvm,
2261  				   const struct kvm_userspace_memory_region *mem,
2262  				   const struct kvm_memory_slot *old,
2263  				   const struct kvm_memory_slot *new,
2264  				   enum kvm_mr_change change)
2265  {
2266  	/*
2267  	 * At this point memslot has been committed and there is an
2268  	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
2269  	 * memory slot is write protected.
2270  	 */
2271  	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
2272  		kvm_mmu_wp_memory_region(kvm, mem->slot);
2273  }
2274  
kvm_arch_prepare_memory_region(struct kvm * kvm,struct kvm_memory_slot * memslot,const struct kvm_userspace_memory_region * mem,enum kvm_mr_change change)2275  int kvm_arch_prepare_memory_region(struct kvm *kvm,
2276  				   struct kvm_memory_slot *memslot,
2277  				   const struct kvm_userspace_memory_region *mem,
2278  				   enum kvm_mr_change change)
2279  {
2280  	hva_t hva = mem->userspace_addr;
2281  	hva_t reg_end = hva + mem->memory_size;
2282  	bool writable = !(mem->flags & KVM_MEM_READONLY);
2283  	int ret = 0;
2284  
2285  	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2286  			change != KVM_MR_FLAGS_ONLY)
2287  		return 0;
2288  
2289  	/*
2290  	 * Prevent userspace from creating a memory region outside of the IPA
2291  	 * space addressable by the KVM guest IPA space.
2292  	 */
2293  	if (memslot->base_gfn + memslot->npages >=
2294  	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
2295  		return -EFAULT;
2296  
2297  	down_read(&current->mm->mmap_sem);
2298  	/*
2299  	 * A memory region could potentially cover multiple VMAs, and any holes
2300  	 * between them, so iterate over all of them to find out if we can map
2301  	 * any of them right now.
2302  	 *
2303  	 *     +--------------------------------------------+
2304  	 * +---------------+----------------+   +----------------+
2305  	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2306  	 * +---------------+----------------+   +----------------+
2307  	 *     |               memory region                |
2308  	 *     +--------------------------------------------+
2309  	 */
2310  	do {
2311  		struct vm_area_struct *vma = find_vma(current->mm, hva);
2312  		hva_t vm_start, vm_end;
2313  
2314  		if (!vma || vma->vm_start >= reg_end)
2315  			break;
2316  
2317  		/*
2318  		 * Mapping a read-only VMA is only allowed if the
2319  		 * memory region is configured as read-only.
2320  		 */
2321  		if (writable && !(vma->vm_flags & VM_WRITE)) {
2322  			ret = -EPERM;
2323  			break;
2324  		}
2325  
2326  		/*
2327  		 * Take the intersection of this VMA with the memory region
2328  		 */
2329  		vm_start = max(hva, vma->vm_start);
2330  		vm_end = min(reg_end, vma->vm_end);
2331  
2332  		if (vma->vm_flags & VM_PFNMAP) {
2333  			gpa_t gpa = mem->guest_phys_addr +
2334  				    (vm_start - mem->userspace_addr);
2335  			phys_addr_t pa;
2336  
2337  			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2338  			pa += vm_start - vma->vm_start;
2339  
2340  			/* IO region dirty page logging not allowed */
2341  			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2342  				ret = -EINVAL;
2343  				goto out;
2344  			}
2345  
2346  			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2347  						    vm_end - vm_start,
2348  						    writable);
2349  			if (ret)
2350  				break;
2351  		}
2352  		hva = vm_end;
2353  	} while (hva < reg_end);
2354  
2355  	if (change == KVM_MR_FLAGS_ONLY)
2356  		goto out;
2357  
2358  	spin_lock(&kvm->mmu_lock);
2359  	if (ret)
2360  		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
2361  	else
2362  		stage2_flush_memslot(kvm, memslot);
2363  	spin_unlock(&kvm->mmu_lock);
2364  out:
2365  	up_read(&current->mm->mmap_sem);
2366  	return ret;
2367  }
2368  
kvm_arch_free_memslot(struct kvm * kvm,struct kvm_memory_slot * free,struct kvm_memory_slot * dont)2369  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
2370  			   struct kvm_memory_slot *dont)
2371  {
2372  }
2373  
kvm_arch_create_memslot(struct kvm * kvm,struct kvm_memory_slot * slot,unsigned long npages)2374  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
2375  			    unsigned long npages)
2376  {
2377  	return 0;
2378  }
2379  
kvm_arch_memslots_updated(struct kvm * kvm,u64 gen)2380  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2381  {
2382  }
2383  
kvm_arch_flush_shadow_all(struct kvm * kvm)2384  void kvm_arch_flush_shadow_all(struct kvm *kvm)
2385  {
2386  	kvm_free_stage2_pgd(kvm);
2387  }
2388  
kvm_arch_flush_shadow_memslot(struct kvm * kvm,struct kvm_memory_slot * slot)2389  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2390  				   struct kvm_memory_slot *slot)
2391  {
2392  	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2393  	phys_addr_t size = slot->npages << PAGE_SHIFT;
2394  
2395  	spin_lock(&kvm->mmu_lock);
2396  	unmap_stage2_range(kvm, gpa, size);
2397  	spin_unlock(&kvm->mmu_lock);
2398  }
2399  
2400  /*
2401   * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2402   *
2403   * Main problems:
2404   * - S/W ops are local to a CPU (not broadcast)
2405   * - We have line migration behind our back (speculation)
2406   * - System caches don't support S/W at all (damn!)
2407   *
2408   * In the face of the above, the best we can do is to try and convert
2409   * S/W ops to VA ops. Because the guest is not allowed to infer the
2410   * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2411   * which is a rather good thing for us.
2412   *
2413   * Also, it is only used when turning caches on/off ("The expected
2414   * usage of the cache maintenance instructions that operate by set/way
2415   * is associated with the cache maintenance instructions associated
2416   * with the powerdown and powerup of caches, if this is required by
2417   * the implementation.").
2418   *
2419   * We use the following policy:
2420   *
2421   * - If we trap a S/W operation, we enable VM trapping to detect
2422   *   caches being turned on/off, and do a full clean.
2423   *
2424   * - We flush the caches on both caches being turned on and off.
2425   *
2426   * - Once the caches are enabled, we stop trapping VM ops.
2427   */
kvm_set_way_flush(struct kvm_vcpu * vcpu)2428  void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2429  {
2430  	unsigned long hcr = *vcpu_hcr(vcpu);
2431  
2432  	/*
2433  	 * If this is the first time we do a S/W operation
2434  	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
2435  	 * VM trapping.
2436  	 *
2437  	 * Otherwise, rely on the VM trapping to wait for the MMU +
2438  	 * Caches to be turned off. At that point, we'll be able to
2439  	 * clean the caches again.
2440  	 */
2441  	if (!(hcr & HCR_TVM)) {
2442  		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2443  					vcpu_has_cache_enabled(vcpu));
2444  		stage2_flush_vm(vcpu->kvm);
2445  		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2446  	}
2447  }
2448  
kvm_toggle_cache(struct kvm_vcpu * vcpu,bool was_enabled)2449  void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2450  {
2451  	bool now_enabled = vcpu_has_cache_enabled(vcpu);
2452  
2453  	/*
2454  	 * If switching the MMU+caches on, need to invalidate the caches.
2455  	 * If switching it off, need to clean the caches.
2456  	 * Clean + invalidate does the trick always.
2457  	 */
2458  	if (now_enabled != was_enabled)
2459  		stage2_flush_vm(vcpu->kvm);
2460  
2461  	/* Caches are now on, stop trapping VM ops (until a S/W op) */
2462  	if (now_enabled)
2463  		*vcpu_hcr(vcpu) &= ~HCR_TVM;
2464  
2465  	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2466  }
2467