1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License, version 2, as
4  * published by the Free Software Foundation.
5  *
6  * This program is distributed in the hope that it will be useful,
7  * but WITHOUT ANY WARRANTY; without even the implied warranty of
8  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
9  * GNU General Public License for more details.
10  *
11  * You should have received a copy of the GNU General Public License
12  * along with this program; if not, write to the Free Software
13  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
14  *
15  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
16  */
17 
18 #include <linux/types.h>
19 #include <linux/string.h>
20 #include <linux/kvm.h>
21 #include <linux/kvm_host.h>
22 #include <linux/highmem.h>
23 #include <linux/gfp.h>
24 #include <linux/slab.h>
25 #include <linux/hugetlb.h>
26 #include <linux/vmalloc.h>
27 #include <linux/srcu.h>
28 #include <linux/anon_inodes.h>
29 #include <linux/file.h>
30 #include <linux/debugfs.h>
31 
32 #include <asm/tlbflush.h>
33 #include <asm/kvm_ppc.h>
34 #include <asm/kvm_book3s.h>
35 #include <asm/book3s/64/mmu-hash.h>
36 #include <asm/hvcall.h>
37 #include <asm/synch.h>
38 #include <asm/ppc-opcode.h>
39 #include <asm/cputable.h>
40 #include <asm/pte-walk.h>
41 
42 #include "trace_hv.h"
43 
44 //#define DEBUG_RESIZE_HPT	1
45 
46 #ifdef DEBUG_RESIZE_HPT
47 #define resize_hpt_debug(resize, ...)				\
48 	do {							\
49 		printk(KERN_DEBUG "RESIZE HPT %p: ", resize);	\
50 		printk(__VA_ARGS__);				\
51 	} while (0)
52 #else
53 #define resize_hpt_debug(resize, ...)				\
54 	do { } while (0)
55 #endif
56 
57 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
58 				long pte_index, unsigned long pteh,
59 				unsigned long ptel, unsigned long *pte_idx_ret);
60 
61 struct kvm_resize_hpt {
62 	/* These fields read-only after init */
63 	struct kvm *kvm;
64 	struct work_struct work;
65 	u32 order;
66 
67 	/* These fields protected by kvm->lock */
68 
69 	/* Possible values and their usage:
70 	 *  <0     an error occurred during allocation,
71 	 *  -EBUSY allocation is in progress,
72 	 *  0      allocation completed successfully.
73 	 */
74 	int error;
75 
76 	/* Private to the work thread, until error != -EBUSY,
77 	 * then protected by kvm->lock.
78 	 */
79 	struct kvm_hpt_info hpt;
80 };
81 
82 static void kvmppc_rmap_reset(struct kvm *kvm);
83 
84 int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
85 {
86 	unsigned long hpt = 0;
87 	int cma = 0;
88 	struct page *page = NULL;
89 	struct revmap_entry *rev;
90 	unsigned long npte;
91 
92 	if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
93 		return -EINVAL;
94 
95 	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
96 	if (page) {
97 		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
98 		memset((void *)hpt, 0, (1ul << order));
99 		cma = 1;
100 	}
101 
102 	if (!hpt)
103 		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
104 				       |__GFP_NOWARN, order - PAGE_SHIFT);
105 
106 	if (!hpt)
107 		return -ENOMEM;
108 
109 	/* HPTEs are 2**4 bytes long */
110 	npte = 1ul << (order - 4);
111 
112 	/* Allocate reverse map array */
113 	rev = vmalloc(sizeof(struct revmap_entry) * npte);
114 	if (!rev) {
115 		pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
116 		if (cma)
117 			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
118 		else
119 			free_pages(hpt, order - PAGE_SHIFT);
120 		return -ENOMEM;
121 	}
122 
123 	info->order = order;
124 	info->virt = hpt;
125 	info->cma = cma;
126 	info->rev = rev;
127 
128 	return 0;
129 }
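/*
 * Illustrative sketch of the size arithmetic above, assuming
 * PPC_MIN_HPT_ORDER is 18 as on this platform: with order = 18 the HPT
 * occupies 1ul << 18 = 256 KiB and, at 16 bytes per HPTE, holds
 * 1ul << (18 - 4) = 16384 entries, so the reverse map needs
 * 16384 * sizeof(struct revmap_entry) bytes of vmalloc space.  Each
 * increment of order doubles both figures.
 */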
130 
131 void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
132 {
133 	atomic64_set(&kvm->arch.mmio_update, 0);
134 	kvm->arch.hpt = *info;
135 	kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
136 
137 	pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
138 		 info->virt, (long)info->order, kvm->arch.lpid);
139 }
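/*
 * A brief note on the SDR1 value computed above, assuming the usual
 * Power ISA HPT encoding: the low bits hold HTABSIZE = log2(table size)
 * - 18, so a 256 KiB table (order 18) stores 0 and a 16 MiB table
 * (order 24) stores 6, while the upper bits hold the real address of
 * the table.
 */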
140 
141 long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
142 {
143 	long err = -EBUSY;
144 	struct kvm_hpt_info info;
145 
146 	if (kvm_is_radix(kvm))
147 		return -EINVAL;
148 
149 	mutex_lock(&kvm->lock);
150 	if (kvm->arch.hpte_setup_done) {
151 		kvm->arch.hpte_setup_done = 0;
152 		/* order hpte_setup_done vs. vcpus_running */
153 		smp_mb();
154 		if (atomic_read(&kvm->arch.vcpus_running)) {
155 			kvm->arch.hpte_setup_done = 1;
156 			goto out;
157 		}
158 	}
159 	if (kvm->arch.hpt.order == order) {
160 		/* We already have a suitable HPT */
161 
162 		/* Set the entire HPT to 0, i.e. invalid HPTEs */
163 		memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
164 		/*
165 		 * Reset all the reverse-mapping chains for all memslots
166 		 */
167 		kvmppc_rmap_reset(kvm);
168 		err = 0;
169 		goto out;
170 	}
171 
172 	if (kvm->arch.hpt.virt) {
173 		kvmppc_free_hpt(&kvm->arch.hpt);
174 		kvmppc_rmap_reset(kvm);
175 	}
176 
177 	err = kvmppc_allocate_hpt(&info, order);
178 	if (err < 0)
179 		goto out;
180 	kvmppc_set_hpt(kvm, &info);
181 
182 out:
183 	if (err == 0)
184 		/* Ensure that each vcpu will flush its TLB on next entry. */
185 		cpumask_setall(&kvm->arch.need_tlb_flush);
186 
187 	mutex_unlock(&kvm->lock);
188 	return err;
189 }
190 
191 void kvmppc_free_hpt(struct kvm_hpt_info *info)
192 {
193 	vfree(info->rev);
194 	if (info->cma)
195 		kvm_free_hpt_cma(virt_to_page(info->virt),
196 				 1 << (info->order - PAGE_SHIFT));
197 	else if (info->virt)
198 		free_pages(info->virt, info->order - PAGE_SHIFT);
199 	info->virt = 0;
200 	info->order = 0;
201 }
202 
203 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
204 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
205 {
206 	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
207 }
208 
209 /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
210 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
211 {
212 	return (pgsize == 0x10000) ? 0x1000 : 0;
213 }
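/*
 * Taken together, the two helpers above yield the following encodings
 * (illustrative summary of the cases named in the comments above, no
 * new cases added):
 *
 *   page size   first dword         second dword
 *   4 KiB       0                   0
 *   64 KiB      HPTE_V_LARGE        0x1000
 *   16 MiB      HPTE_V_LARGE        0
 */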
214 
215 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
216 		     unsigned long porder)
217 {
218 	unsigned long i;
219 	unsigned long npages;
220 	unsigned long hp_v, hp_r;
221 	unsigned long addr, hash;
222 	unsigned long psize;
223 	unsigned long hp0, hp1;
224 	unsigned long idx_ret;
225 	long ret;
226 	struct kvm *kvm = vcpu->kvm;
227 
228 	psize = 1ul << porder;
229 	npages = memslot->npages >> (porder - PAGE_SHIFT);
230 
231 	/* VRMA can't be > 1TB */
232 	if (npages > 1ul << (40 - porder))
233 		npages = 1ul << (40 - porder);
234 	/* Can't use more than 1 HPTE per HPTEG */
235 	if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
236 		npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
237 
238 	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
239 		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
240 	hp1 = hpte1_pgsize_encoding(psize) |
241 		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
242 
243 	for (i = 0; i < npages; ++i) {
244 		addr = i << porder;
245 		/* can't use hpt_hash since va > 64 bits */
246 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
247 			& kvmppc_hpt_mask(&kvm->arch.hpt);
248 		/*
249 		 * We assume that the hash table is empty and no
250 		 * vcpus are using it at this stage.  Since we create
251 		 * at most one HPTE per HPTEG, we just assume entry 7
252 		 * is available and use it.
253 		 */
254 		hash = (hash << 3) + 7;
255 		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
256 		hp_r = hp1 | addr;
257 		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
258 						 &idx_ret);
259 		if (ret != H_SUCCESS) {
260 			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
261 			       addr, ret);
262 			break;
263 		}
264 	}
265 }
266 
267 int kvmppc_mmu_hv_init(void)
268 {
269 	unsigned long host_lpid, rsvd_lpid;
270 
271 	if (!cpu_has_feature(CPU_FTR_HVMODE))
272 		return -EINVAL;
273 
274 	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
275 	host_lpid = mfspr(SPRN_LPID);
276 	rsvd_lpid = LPID_RSVD;
277 
278 	kvmppc_init_lpid(rsvd_lpid + 1);
279 
280 	kvmppc_claim_lpid(host_lpid);
281 	/* rsvd_lpid is reserved for use in partition switching */
282 	kvmppc_claim_lpid(rsvd_lpid);
283 
284 	return 0;
285 }
286 
287 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
288 {
289 	unsigned long msr = vcpu->arch.intr_msr;
290 
291 	/* If transactional, change to suspend mode on IRQ delivery */
292 	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
293 		msr |= MSR_TS_S;
294 	else
295 		msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
296 	kvmppc_set_msr(vcpu, msr);
297 }
298 
299 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
300 				long pte_index, unsigned long pteh,
301 				unsigned long ptel, unsigned long *pte_idx_ret)
302 {
303 	long ret;
304 
305 	/* Protect linux PTE lookup from page table destruction */
306 	rcu_read_lock_sched();	/* this disables preemption too */
307 	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
308 				current->mm->pgd, false, pte_idx_ret);
309 	rcu_read_unlock_sched();
310 	if (ret == H_TOO_HARD) {
311 		/* this can't happen */
312 		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
313 		ret = H_RESOURCE;	/* or something */
314 	}
315 	return ret;
316 
317 }
318 
319 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
320 							 gva_t eaddr)
321 {
322 	u64 mask;
323 	int i;
324 
325 	for (i = 0; i < vcpu->arch.slb_nr; i++) {
326 		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
327 			continue;
328 
329 		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
330 			mask = ESID_MASK_1T;
331 		else
332 			mask = ESID_MASK;
333 
334 		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
335 			return &vcpu->arch.slb[i];
336 	}
337 	return NULL;
338 }
339 
340 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
341 			unsigned long ea)
342 {
343 	unsigned long ra_mask;
344 
345 	ra_mask = hpte_page_size(v, r) - 1;
346 	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
347 }
348 
349 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
350 			struct kvmppc_pte *gpte, bool data, bool iswrite)
351 {
352 	struct kvm *kvm = vcpu->kvm;
353 	struct kvmppc_slb *slbe;
354 	unsigned long slb_v;
355 	unsigned long pp, key;
356 	unsigned long v, orig_v, gr;
357 	__be64 *hptep;
358 	long int index;
359 	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
360 
361 	/* Get SLB entry */
362 	if (virtmode) {
363 		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
364 		if (!slbe)
365 			return -EINVAL;
366 		slb_v = slbe->origv;
367 	} else {
368 		/* real mode access */
369 		slb_v = vcpu->kvm->arch.vrma_slb_v;
370 	}
371 
372 	preempt_disable();
373 	/* Find the HPTE in the hash table */
374 	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
375 					 HPTE_V_VALID | HPTE_V_ABSENT);
376 	if (index < 0) {
377 		preempt_enable();
378 		return -ENOENT;
379 	}
380 	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
381 	v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
382 	if (cpu_has_feature(CPU_FTR_ARCH_300))
383 		v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
384 	gr = kvm->arch.hpt.rev[index].guest_rpte;
385 
386 	unlock_hpte(hptep, orig_v);
387 	preempt_enable();
388 
389 	gpte->eaddr = eaddr;
390 	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
391 
392 	/* Get PP bits and key for permission check */
393 	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
394 	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
395 	key &= slb_v;
396 
397 	/* Calculate permissions */
398 	gpte->may_read = hpte_read_permission(pp, key);
399 	gpte->may_write = hpte_write_permission(pp, key);
400 	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
401 
402 	/* Storage key permission check for POWER7 */
403 	if (data && virtmode) {
404 		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
405 		if (amrfield & 1)
406 			gpte->may_read = 0;
407 		if (amrfield & 2)
408 			gpte->may_write = 0;
409 	}
410 
411 	/* Get the guest physical address */
412 	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
413 	return 0;
414 }
415 
416 /*
417  * Quick test for whether an instruction is a load or a store.
418  * If the instruction is a load or a store, then this will indicate
419  * which it is, at least on server processors.  (Embedded processors
420  * have some external PID instructions that don't follow the rule
421  * embodied here.)  If the instruction isn't a load or store, then
422  * this doesn't return anything useful.
423  */
424 static int instruction_is_store(unsigned int instr)
425 {
426 	unsigned int mask;
427 
428 	mask = 0x10000000;
429 	if ((instr & 0xfc000000) == 0x7c000000)
430 		mask = 0x100;		/* major opcode 31 */
431 	return (instr & mask) != 0;
432 }
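/*
 * A couple of worked examples for the heuristic above, using standard
 * PowerPC encodings; this sketch is for illustration only and is
 * compiled out.
 */
#if 0
static void instruction_is_store_examples(void)
{
	/* "stw r3,0(r4)" == 0x90640000: major opcode 36, bit 0x10000000 set */
	BUG_ON(!instruction_is_store(0x90640000));
	/* "lwz r3,0(r4)" == 0x80640000: major opcode 32, bit 0x10000000 clear */
	BUG_ON(instruction_is_store(0x80640000));
	/* opcode 31 forms use bit 0x100 of the extended opcode field instead */
	BUG_ON(!instruction_is_store(0x7c64212e));	/* stwx r3,r4,r4 */
	BUG_ON(instruction_is_store(0x7c64202e));	/* lwzx r3,r4,r4 */
}
#endif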
433 
434 int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
435 			   unsigned long gpa, gva_t ea, int is_store)
436 {
437 	u32 last_inst;
438 
439 	/*
440 	 * If we fail, we just return to the guest and try executing it again.
441 	 */
442 	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
443 		EMULATE_DONE)
444 		return RESUME_GUEST;
445 
446 	/*
447 	 * WARNING: We do not know for sure whether the instruction we just
448 	 * read from memory is the same that caused the fault in the first
449 	 * place.  If the instruction we read is neither a load nor a store,
450 	 * then it can't access memory, so we don't need to worry about
451 	 * enforcing access permissions.  So, assuming it is a load or
452 	 * store, we just check that its direction (load or store) is
453 	 * consistent with the original fault, since that's what we
454 	 * checked the access permissions against.  If there is a mismatch
455 	 * we just return and retry the instruction.
456 	 */
457 
458 	if (instruction_is_store(last_inst) != !!is_store)
459 		return RESUME_GUEST;
460 
461 	/*
462 	 * Emulated accesses are emulated by looking at the hash for
463 	 * translation once, then performing the access later. The
464 		 * translation could be invalidated in the meantime, at which
465 	 * point performing the subsequent memory access on the old
466 	 * physical address could possibly be a security hole for the
467 	 * guest (but not the host).
468 	 *
469 	 * This is less of an issue for MMIO stores since they aren't
470 	 * globally visible. It could be an issue for MMIO loads to
471 	 * a certain extent but we'll ignore it for now.
472 	 */
473 
474 	vcpu->arch.paddr_accessed = gpa;
475 	vcpu->arch.vaddr_accessed = ea;
476 	return kvmppc_emulate_mmio(run, vcpu);
477 }
478 
479 int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
480 				unsigned long ea, unsigned long dsisr)
481 {
482 	struct kvm *kvm = vcpu->kvm;
483 	unsigned long hpte[3], r;
484 	unsigned long hnow_v, hnow_r;
485 	__be64 *hptep;
486 	unsigned long mmu_seq, psize, pte_size;
487 	unsigned long gpa_base, gfn_base;
488 	unsigned long gpa, gfn, hva, pfn;
489 	struct kvm_memory_slot *memslot;
490 	unsigned long *rmap;
491 	struct revmap_entry *rev;
492 	struct page *page, *pages[1];
493 	long index, ret, npages;
494 	bool is_ci;
495 	unsigned int writing, write_ok;
496 	struct vm_area_struct *vma;
497 	unsigned long rcbits;
498 	long mmio_update;
499 
500 	if (kvm_is_radix(kvm))
501 		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
502 
503 	/*
504 	 * Real-mode code has already searched the HPT and found the
505 	 * entry we're interested in.  Lock the entry and check that
506 	 * it hasn't changed.  If it has, just return and re-execute the
507 	 * instruction.
508 	 */
509 	if (ea != vcpu->arch.pgfault_addr)
510 		return RESUME_GUEST;
511 
512 	if (vcpu->arch.pgfault_cache) {
513 		mmio_update = atomic64_read(&kvm->arch.mmio_update);
514 		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
515 			r = vcpu->arch.pgfault_cache->rpte;
516 			psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
517 			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
518 			gfn_base = gpa_base >> PAGE_SHIFT;
519 			gpa = gpa_base | (ea & (psize - 1));
520 			return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
521 						dsisr & DSISR_ISSTORE);
522 		}
523 	}
524 	index = vcpu->arch.pgfault_index;
525 	hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
526 	rev = &kvm->arch.hpt.rev[index];
527 	preempt_disable();
528 	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
529 		cpu_relax();
530 	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
531 	hpte[1] = be64_to_cpu(hptep[1]);
532 	hpte[2] = r = rev->guest_rpte;
533 	unlock_hpte(hptep, hpte[0]);
534 	preempt_enable();
535 
536 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
537 		hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
538 		hpte[1] = hpte_new_to_old_r(hpte[1]);
539 	}
540 	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
541 	    hpte[1] != vcpu->arch.pgfault_hpte[1])
542 		return RESUME_GUEST;
543 
544 	/* Translate the logical address and get the page */
545 	psize = hpte_page_size(hpte[0], r);
546 	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
547 	gfn_base = gpa_base >> PAGE_SHIFT;
548 	gpa = gpa_base | (ea & (psize - 1));
549 	gfn = gpa >> PAGE_SHIFT;
550 	memslot = gfn_to_memslot(kvm, gfn);
551 
552 	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
553 
554 	/* No memslot means it's an emulated MMIO region */
555 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
556 		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
557 					      dsisr & DSISR_ISSTORE);
558 
559 	/*
560 	 * This should never happen, because of the slot_is_aligned()
561 	 * check in kvmppc_do_h_enter().
562 	 */
563 	if (gfn_base < memslot->base_gfn)
564 		return -EFAULT;
565 
566 	/* used to check for invalidations in progress */
567 	mmu_seq = kvm->mmu_notifier_seq;
568 	smp_rmb();
569 
570 	ret = -EFAULT;
571 	is_ci = false;
572 	pfn = 0;
573 	page = NULL;
574 	pte_size = PAGE_SIZE;
575 	writing = (dsisr & DSISR_ISSTORE) != 0;
576 	/* If writing != 0, then the HPTE must allow writing, if we get here */
577 	write_ok = writing;
578 	hva = gfn_to_hva_memslot(memslot, gfn);
579 	npages = get_user_pages_fast(hva, 1, writing, pages);
580 	if (npages < 1) {
581 		/* Check if it's an I/O mapping */
582 		down_read(&current->mm->mmap_sem);
583 		vma = find_vma(current->mm, hva);
584 		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
585 		    (vma->vm_flags & VM_PFNMAP)) {
586 			pfn = vma->vm_pgoff +
587 				((hva - vma->vm_start) >> PAGE_SHIFT);
588 			pte_size = psize;
589 			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
590 			write_ok = vma->vm_flags & VM_WRITE;
591 		}
592 		up_read(&current->mm->mmap_sem);
593 		if (!pfn)
594 			goto out_put;
595 	} else {
596 		page = pages[0];
597 		pfn = page_to_pfn(page);
598 		if (PageHuge(page)) {
599 			page = compound_head(page);
600 			pte_size <<= compound_order(page);
601 		}
602 		/* if the guest wants write access, see if that is OK */
603 		if (!writing && hpte_is_writable(r)) {
604 			pte_t *ptep, pte;
605 			unsigned long flags;
606 			/*
607 			 * We need to protect against page table destruction,
608 			 * and against hugepage split and collapse.
609 			 */
610 			local_irq_save(flags);
611 			ptep = find_current_mm_pte(current->mm->pgd,
612 						   hva, NULL, NULL);
613 			if (ptep) {
614 				pte = kvmppc_read_update_linux_pte(ptep, 1);
615 				if (__pte_write(pte))
616 					write_ok = 1;
617 			}
618 			local_irq_restore(flags);
619 		}
620 	}
621 
622 	if (psize > pte_size)
623 		goto out_put;
624 
625 	/* Check WIMG vs. the actual page we're accessing */
626 	if (!hpte_cache_flags_ok(r, is_ci)) {
627 		if (is_ci)
628 			goto out_put;
629 		/*
630 		 * Allow guest to map emulated device memory as
631 		 * uncacheable, but actually make it cacheable.
632 		 */
633 		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
634 	}
635 
636 	/*
637 	 * Set the HPTE to point to pfn.
638 	 * Since the pfn is at PAGE_SIZE granularity, make sure we
639 	 * don't mask out lower-order bits if psize < PAGE_SIZE.
640 	 */
641 	if (psize < PAGE_SIZE)
642 		psize = PAGE_SIZE;
643 	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
644 					((pfn << PAGE_SHIFT) & ~(psize - 1));
645 	if (hpte_is_writable(r) && !write_ok)
646 		r = hpte_make_readonly(r);
647 	ret = RESUME_GUEST;
648 	preempt_disable();
649 	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
650 		cpu_relax();
651 	hnow_v = be64_to_cpu(hptep[0]);
652 	hnow_r = be64_to_cpu(hptep[1]);
653 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
654 		hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
655 		hnow_r = hpte_new_to_old_r(hnow_r);
656 	}
657 
658 	/*
659 	 * If the HPT is being resized, don't update the HPTE;
660 	 * instead let the guest retry after the resize operation is complete.
661 	 * The synchronization for hpte_setup_done test vs. set is provided
662 	 * by the HPTE lock.
663 	 */
664 	if (!kvm->arch.hpte_setup_done)
665 		goto out_unlock;
666 
667 	if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
668 	    rev->guest_rpte != hpte[2])
669 		/* HPTE has been changed under us; let the guest retry */
670 		goto out_unlock;
671 	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
672 
673 	/* Always put the HPTE in the rmap chain for the page base address */
674 	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
675 	lock_rmap(rmap);
676 
677 	/* Check if we might have been invalidated; let the guest retry if so */
678 	ret = RESUME_GUEST;
679 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
680 		unlock_rmap(rmap);
681 		goto out_unlock;
682 	}
683 
684 	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
685 	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
686 	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
687 
688 	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
689 		/* HPTE was previously valid, so we need to invalidate it */
690 		unlock_rmap(rmap);
691 		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
692 		kvmppc_invalidate_hpte(kvm, hptep, index);
693 		/* don't lose previous R and C bits */
694 		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
695 	} else {
696 		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
697 	}
698 
699 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
700 		r = hpte_old_to_new_r(hpte[0], r);
701 		hpte[0] = hpte_old_to_new_v(hpte[0]);
702 	}
703 	hptep[1] = cpu_to_be64(r);
704 	eieio();
705 	__unlock_hpte(hptep, hpte[0]);
706 	asm volatile("ptesync" : : : "memory");
707 	preempt_enable();
708 	if (page && hpte_is_writable(r))
709 		SetPageDirty(page);
710 
711  out_put:
712 	trace_kvm_page_fault_exit(vcpu, hpte, ret);
713 
714 	if (page) {
715 		/*
716 		 * We drop pages[0] here, not page, because page might
717 		 * have been set to the head page of a compound, but
718 		 * we have to drop the reference on the correct tail
719 		 * page to match the get inside gup()
720 		 */
721 		put_page(pages[0]);
722 	}
723 	return ret;
724 
725  out_unlock:
726 	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
727 	preempt_enable();
728 	goto out_put;
729 }
730 
731 static void kvmppc_rmap_reset(struct kvm *kvm)
732 {
733 	struct kvm_memslots *slots;
734 	struct kvm_memory_slot *memslot;
735 	int srcu_idx;
736 
737 	srcu_idx = srcu_read_lock(&kvm->srcu);
738 	slots = kvm_memslots(kvm);
739 	kvm_for_each_memslot(memslot, slots) {
740 		/*
741 		 * This assumes it is acceptable to lose reference and
742 		 * change bits across a reset.
743 		 */
744 		memset(memslot->arch.rmap, 0,
745 		       memslot->npages * sizeof(*memslot->arch.rmap));
746 	}
747 	srcu_read_unlock(&kvm->srcu, srcu_idx);
748 }
749 
750 typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
751 			      unsigned long gfn);
752 
753 static int kvm_handle_hva_range(struct kvm *kvm,
754 				unsigned long start,
755 				unsigned long end,
756 				hva_handler_fn handler)
757 {
758 	int ret;
759 	int retval = 0;
760 	struct kvm_memslots *slots;
761 	struct kvm_memory_slot *memslot;
762 
763 	slots = kvm_memslots(kvm);
764 	kvm_for_each_memslot(memslot, slots) {
765 		unsigned long hva_start, hva_end;
766 		gfn_t gfn, gfn_end;
767 
768 		hva_start = max(start, memslot->userspace_addr);
769 		hva_end = min(end, memslot->userspace_addr +
770 					(memslot->npages << PAGE_SHIFT));
771 		if (hva_start >= hva_end)
772 			continue;
773 		/*
774 		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
775 		 * {gfn, gfn+1, ..., gfn_end-1}.
776 		 */
777 		gfn = hva_to_gfn_memslot(hva_start, memslot);
778 		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
779 
780 		for (; gfn < gfn_end; ++gfn) {
781 			ret = handler(kvm, memslot, gfn);
782 			retval |= ret;
783 		}
784 	}
785 
786 	return retval;
787 }
788 
789 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
790 			  hva_handler_fn handler)
791 {
792 	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
793 }
794 
795 /* Must be called with both HPTE and rmap locked */
796 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
797 			      unsigned long *rmapp, unsigned long gfn)
798 {
799 	__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
800 	struct revmap_entry *rev = kvm->arch.hpt.rev;
801 	unsigned long j, h;
802 	unsigned long ptel, psize, rcbits;
803 
804 	j = rev[i].forw;
805 	if (j == i) {
806 		/* chain is now empty */
807 		*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
808 	} else {
809 		/* remove i from chain */
810 		h = rev[i].back;
811 		rev[h].forw = j;
812 		rev[j].back = h;
813 		rev[i].forw = rev[i].back = i;
814 		*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
815 	}
816 
817 	/* Now check and modify the HPTE */
818 	ptel = rev[i].guest_rpte;
819 	psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
820 	if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
821 	    hpte_rpn(ptel, psize) == gfn) {
822 		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
823 		kvmppc_invalidate_hpte(kvm, hptep, i);
824 		hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
825 		/* Harvest R and C */
826 		rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
827 		*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
828 		if (rcbits & HPTE_R_C)
829 			kvmppc_update_rmap_change(rmapp, psize);
830 		if (rcbits & ~rev[i].guest_rpte) {
831 			rev[i].guest_rpte = ptel | rcbits;
832 			note_hpte_modification(kvm, &rev[i]);
833 		}
834 	}
835 }
836 
837 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
838 			   unsigned long gfn)
839 {
840 	unsigned long i;
841 	__be64 *hptep;
842 	unsigned long *rmapp;
843 
844 	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
845 	for (;;) {
846 		lock_rmap(rmapp);
847 		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
848 			unlock_rmap(rmapp);
849 			break;
850 		}
851 
852 		/*
853 		 * To avoid an ABBA deadlock with the HPTE lock bit,
854 		 * we can't spin on the HPTE lock while holding the
855 		 * rmap chain lock.
856 		 */
857 		i = *rmapp & KVMPPC_RMAP_INDEX;
858 		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
859 		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
860 			/* unlock rmap before spinning on the HPTE lock */
861 			unlock_rmap(rmapp);
862 			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
863 				cpu_relax();
864 			continue;
865 		}
866 
867 		kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
868 		unlock_rmap(rmapp);
869 		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
870 	}
871 	return 0;
872 }
873 
874 int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
875 {
876 	hva_handler_fn handler;
877 
878 	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
879 	kvm_handle_hva(kvm, hva, handler);
880 	return 0;
881 }
882 
883 int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
884 {
885 	hva_handler_fn handler;
886 
887 	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
888 	kvm_handle_hva_range(kvm, start, end, handler);
889 	return 0;
890 }
891 
892 void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
893 				  struct kvm_memory_slot *memslot)
894 {
895 	unsigned long gfn;
896 	unsigned long n;
897 	unsigned long *rmapp;
898 
899 	gfn = memslot->base_gfn;
900 	rmapp = memslot->arch.rmap;
901 	for (n = memslot->npages; n; --n, ++gfn) {
902 		if (kvm_is_radix(kvm)) {
903 			kvm_unmap_radix(kvm, memslot, gfn);
904 			continue;
905 		}
906 		/*
907 		 * Testing the present bit without locking is OK because
908 		 * the memslot has been marked invalid already, and hence
909 		 * no new HPTEs referencing this page can be created,
910 		 * thus the present bit can't go from 0 to 1.
911 		 */
912 		if (*rmapp & KVMPPC_RMAP_PRESENT)
913 			kvm_unmap_rmapp(kvm, memslot, gfn);
914 		++rmapp;
915 	}
916 }
917 
918 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
919 			 unsigned long gfn)
920 {
921 	struct revmap_entry *rev = kvm->arch.hpt.rev;
922 	unsigned long head, i, j;
923 	__be64 *hptep;
924 	int ret = 0;
925 	unsigned long *rmapp;
926 
927 	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
928  retry:
929 	lock_rmap(rmapp);
930 	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
931 		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
932 		ret = 1;
933 	}
934 	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
935 		unlock_rmap(rmapp);
936 		return ret;
937 	}
938 
939 	i = head = *rmapp & KVMPPC_RMAP_INDEX;
940 	do {
941 		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
942 		j = rev[i].forw;
943 
944 		/* If this HPTE isn't referenced, ignore it */
945 		if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
946 			continue;
947 
948 		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
949 			/* unlock rmap before spinning on the HPTE lock */
950 			unlock_rmap(rmapp);
951 			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
952 				cpu_relax();
953 			goto retry;
954 		}
955 
956 		/* Now check and modify the HPTE */
957 		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
958 		    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
959 			kvmppc_clear_ref_hpte(kvm, hptep, i);
960 			if (!(rev[i].guest_rpte & HPTE_R_R)) {
961 				rev[i].guest_rpte |= HPTE_R_R;
962 				note_hpte_modification(kvm, &rev[i]);
963 			}
964 			ret = 1;
965 		}
966 		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
967 	} while ((i = j) != head);
968 
969 	unlock_rmap(rmapp);
970 	return ret;
971 }
972 
973 int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
974 {
975 	hva_handler_fn handler;
976 
977 	handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
978 	return kvm_handle_hva_range(kvm, start, end, handler);
979 }
980 
981 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
982 			      unsigned long gfn)
983 {
984 	struct revmap_entry *rev = kvm->arch.hpt.rev;
985 	unsigned long head, i, j;
986 	unsigned long *hp;
987 	int ret = 1;
988 	unsigned long *rmapp;
989 
990 	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
991 	if (*rmapp & KVMPPC_RMAP_REFERENCED)
992 		return 1;
993 
994 	lock_rmap(rmapp);
995 	if (*rmapp & KVMPPC_RMAP_REFERENCED)
996 		goto out;
997 
998 	if (*rmapp & KVMPPC_RMAP_PRESENT) {
999 		i = head = *rmapp & KVMPPC_RMAP_INDEX;
1000 		do {
1001 			hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
1002 			j = rev[i].forw;
1003 			if (be64_to_cpu(hp[1]) & HPTE_R_R)
1004 				goto out;
1005 		} while ((i = j) != head);
1006 	}
1007 	ret = 0;
1008 
1009  out:
1010 	unlock_rmap(rmapp);
1011 	return ret;
1012 }
1013 
1014 int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
1015 {
1016 	hva_handler_fn handler;
1017 
1018 	handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
1019 	return kvm_handle_hva(kvm, hva, handler);
1020 }
1021 
1022 void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
1023 {
1024 	hva_handler_fn handler;
1025 
1026 	handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
1027 	kvm_handle_hva(kvm, hva, handler);
1028 }
1029 
1030 static int vcpus_running(struct kvm *kvm)
1031 {
1032 	return atomic_read(&kvm->arch.vcpus_running) != 0;
1033 }
1034 
1035 /*
1036  * Returns the number of system pages that are dirty.
1037  * This can be more than 1 if we find a huge-page HPTE.
1038  */
1039 static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
1040 {
1041 	struct revmap_entry *rev = kvm->arch.hpt.rev;
1042 	unsigned long head, i, j;
1043 	unsigned long n;
1044 	unsigned long v, r;
1045 	__be64 *hptep;
1046 	int npages_dirty = 0;
1047 
1048  retry:
1049 	lock_rmap(rmapp);
1050 	if (*rmapp & KVMPPC_RMAP_CHANGED) {
1051 		long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
1052 			>> KVMPPC_RMAP_CHG_SHIFT;
1053 		*rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
1054 		npages_dirty = 1;
1055 		if (change_order > PAGE_SHIFT)
1056 			npages_dirty = 1ul << (change_order - PAGE_SHIFT);
1057 	}
1058 	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
1059 		unlock_rmap(rmapp);
1060 		return npages_dirty;
1061 	}
1062 
1063 	i = head = *rmapp & KVMPPC_RMAP_INDEX;
1064 	do {
1065 		unsigned long hptep1;
1066 		hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
1067 		j = rev[i].forw;
1068 
1069 		/*
1070 		 * Checking the C (changed) bit here is racy since there
1071 		 * is no guarantee about when the hardware writes it back.
1072 		 * If the HPTE is not writable then it is stable since the
1073 		 * page can't be written to, and we would have done a tlbie
1074 		 * (which forces the hardware to complete any writeback)
1075 		 * when making the HPTE read-only.
1076 		 * If vcpus are running then this call is racy anyway
1077 		 * since the page could get dirtied subsequently, so we
1078 		 * expect there to be a further call which would pick up
1079 		 * any delayed C bit writeback.
1080 		 * Otherwise we need to do the tlbie even if C==0 in
1081 		 * order to pick up any delayed writeback of C.
1082 		 */
1083 		hptep1 = be64_to_cpu(hptep[1]);
1084 		if (!(hptep1 & HPTE_R_C) &&
1085 		    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
1086 			continue;
1087 
1088 		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
1089 			/* unlock rmap before spinning on the HPTE lock */
1090 			unlock_rmap(rmapp);
1091 			while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
1092 				cpu_relax();
1093 			goto retry;
1094 		}
1095 
1096 		/* Now check and modify the HPTE */
1097 		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
1098 			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
1099 			continue;
1100 		}
1101 
1102 		/* need to make it temporarily absent so C is stable */
1103 		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
1104 		kvmppc_invalidate_hpte(kvm, hptep, i);
1105 		v = be64_to_cpu(hptep[0]);
1106 		r = be64_to_cpu(hptep[1]);
1107 		if (r & HPTE_R_C) {
1108 			hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
1109 			if (!(rev[i].guest_rpte & HPTE_R_C)) {
1110 				rev[i].guest_rpte |= HPTE_R_C;
1111 				note_hpte_modification(kvm, &rev[i]);
1112 			}
1113 			n = hpte_page_size(v, r);
1114 			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
1115 			if (n > npages_dirty)
1116 				npages_dirty = n;
1117 			eieio();
1118 		}
1119 		v &= ~HPTE_V_ABSENT;
1120 		v |= HPTE_V_VALID;
1121 		__unlock_hpte(hptep, v);
1122 	} while ((i = j) != head);
1123 
1124 	unlock_rmap(rmapp);
1125 	return npages_dirty;
1126 }
1127 
1128 void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
1129 			      struct kvm_memory_slot *memslot,
1130 			      unsigned long *map)
1131 {
1132 	unsigned long gfn;
1133 
1134 	if (!vpa->dirty || !vpa->pinned_addr)
1135 		return;
1136 	gfn = vpa->gpa >> PAGE_SHIFT;
1137 	if (gfn < memslot->base_gfn ||
1138 	    gfn >= memslot->base_gfn + memslot->npages)
1139 		return;
1140 
1141 	vpa->dirty = false;
1142 	if (map)
1143 		__set_bit_le(gfn - memslot->base_gfn, map);
1144 }
1145 
1146 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
1147 			struct kvm_memory_slot *memslot, unsigned long *map)
1148 {
1149 	unsigned long i, j;
1150 	unsigned long *rmapp;
1151 
1152 	preempt_disable();
1153 	rmapp = memslot->arch.rmap;
1154 	for (i = 0; i < memslot->npages; ++i) {
1155 		int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
1156 		/*
1157 		 * Note that if npages > 0 then i must be a multiple of npages,
1158 		 * since we always put huge-page HPTEs in the rmap chain
1159 		 * corresponding to their page base address.
1160 		 */
1161 		if (npages && map)
1162 			for (j = i; npages; ++j, --npages)
1163 				__set_bit_le(j, map);
1164 		++rmapp;
1165 	}
1166 	preempt_enable();
1167 	return 0;
1168 }
1169 
1170 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1171 			    unsigned long *nb_ret)
1172 {
1173 	struct kvm_memory_slot *memslot;
1174 	unsigned long gfn = gpa >> PAGE_SHIFT;
1175 	struct page *page, *pages[1];
1176 	int npages;
1177 	unsigned long hva, offset;
1178 	int srcu_idx;
1179 
1180 	srcu_idx = srcu_read_lock(&kvm->srcu);
1181 	memslot = gfn_to_memslot(kvm, gfn);
1182 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1183 		goto err;
1184 	hva = gfn_to_hva_memslot(memslot, gfn);
1185 	npages = get_user_pages_fast(hva, 1, 1, pages);
1186 	if (npages < 1)
1187 		goto err;
1188 	page = pages[0];
1189 	srcu_read_unlock(&kvm->srcu, srcu_idx);
1190 
1191 	offset = gpa & (PAGE_SIZE - 1);
1192 	if (nb_ret)
1193 		*nb_ret = PAGE_SIZE - offset;
1194 	return page_address(page) + offset;
1195 
1196  err:
1197 	srcu_read_unlock(&kvm->srcu, srcu_idx);
1198 	return NULL;
1199 }
1200 
1201 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1202 			     bool dirty)
1203 {
1204 	struct page *page = virt_to_page(va);
1205 	struct kvm_memory_slot *memslot;
1206 	unsigned long gfn;
1207 	unsigned long *rmap;
1208 	int srcu_idx;
1209 
1210 	put_page(page);
1211 
1212 	if (!dirty)
1213 		return;
1214 
1215 	/* We need to mark this page dirty in the rmap chain */
1216 	gfn = gpa >> PAGE_SHIFT;
1217 	srcu_idx = srcu_read_lock(&kvm->srcu);
1218 	memslot = gfn_to_memslot(kvm, gfn);
1219 	if (memslot) {
1220 		if (!kvm_is_radix(kvm)) {
1221 			rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
1222 			lock_rmap(rmap);
1223 			*rmap |= KVMPPC_RMAP_CHANGED;
1224 			unlock_rmap(rmap);
1225 		} else if (memslot->dirty_bitmap) {
1226 			mark_page_dirty(kvm, gfn);
1227 		}
1228 	}
1229 	srcu_read_unlock(&kvm->srcu, srcu_idx);
1230 }
1231 
1232 /*
1233  * HPT resizing
1234  */
1235 static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
1236 {
1237 	int rc;
1238 
1239 	rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
1240 	if (rc < 0)
1241 		return rc;
1242 
1243 	resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
1244 			 resize->hpt.virt);
1245 
1246 	return 0;
1247 }
1248 
1249 static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
1250 					    unsigned long idx)
1251 {
1252 	struct kvm *kvm = resize->kvm;
1253 	struct kvm_hpt_info *old = &kvm->arch.hpt;
1254 	struct kvm_hpt_info *new = &resize->hpt;
1255 	unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
1256 	unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
1257 	__be64 *hptep, *new_hptep;
1258 	unsigned long vpte, rpte, guest_rpte;
1259 	int ret;
1260 	struct revmap_entry *rev;
1261 	unsigned long apsize, psize, avpn, pteg, hash;
1262 	unsigned long new_idx, new_pteg, replace_vpte;
1263 
1264 	hptep = (__be64 *)(old->virt + (idx << 4));
1265 
1266 	/* Guest is stopped, so new HPTEs can't be added or faulted
1267 	 * in, only unmapped or altered by host actions.  So, it's
1268 	 * safe to check this before we take the HPTE lock */
1269 	vpte = be64_to_cpu(hptep[0]);
1270 	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1271 		return 0; /* nothing to do */
1272 
1273 	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
1274 		cpu_relax();
1275 
1276 	vpte = be64_to_cpu(hptep[0]);
1277 
1278 	ret = 0;
1279 	if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1280 		/* Nothing to do */
1281 		goto out;
1282 
1283 	/* Unmap */
1284 	rev = &old->rev[idx];
1285 	guest_rpte = rev->guest_rpte;
1286 
1287 	ret = -EIO;
1288 	apsize = hpte_page_size(vpte, guest_rpte);
1289 	if (!apsize)
1290 		goto out;
1291 
1292 	if (vpte & HPTE_V_VALID) {
1293 		unsigned long gfn = hpte_rpn(guest_rpte, apsize);
1294 		int srcu_idx = srcu_read_lock(&kvm->srcu);
1295 		struct kvm_memory_slot *memslot =
1296 			__gfn_to_memslot(kvm_memslots(kvm), gfn);
1297 
1298 		if (memslot) {
1299 			unsigned long *rmapp;
1300 			rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1301 
1302 			lock_rmap(rmapp);
1303 			kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
1304 			unlock_rmap(rmapp);
1305 		}
1306 
1307 		srcu_read_unlock(&kvm->srcu, srcu_idx);
1308 	}
1309 
1310 	/* Reload PTE after unmap */
1311 	vpte = be64_to_cpu(hptep[0]);
1312 
1313 	BUG_ON(vpte & HPTE_V_VALID);
1314 	BUG_ON(!(vpte & HPTE_V_ABSENT));
1315 
1316 	ret = 0;
1317 	if (!(vpte & HPTE_V_BOLTED))
1318 		goto out;
1319 
1320 	rpte = be64_to_cpu(hptep[1]);
1321 	psize = hpte_base_page_size(vpte, rpte);
1322 	avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23);
1323 	pteg = idx / HPTES_PER_GROUP;
1324 	if (vpte & HPTE_V_SECONDARY)
1325 		pteg = ~pteg;
1326 
1327 	if (!(vpte & HPTE_V_1TB_SEG)) {
1328 		unsigned long offset, vsid;
1329 
1330 		/* We only have 28 - 23 bits of offset in avpn */
1331 		offset = (avpn & 0x1f) << 23;
1332 		vsid = avpn >> 5;
1333 		/* We can find more bits from the pteg value */
1334 		if (psize < (1ULL << 23))
1335 			offset |= ((vsid ^ pteg) & old_hash_mask) * psize;
1336 
1337 		hash = vsid ^ (offset / psize);
1338 	} else {
1339 		unsigned long offset, vsid;
1340 
1341 		/* We only have 40 - 23 bits of seg_off in avpn */
1342 		offset = (avpn & 0x1ffff) << 23;
1343 		vsid = avpn >> 17;
1344 		if (psize < (1ULL << 23))
1345 			offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize;
1346 
1347 		hash = vsid ^ (vsid << 25) ^ (offset / psize);
1348 	}
1349 
1350 	new_pteg = hash & new_hash_mask;
1351 	if (vpte & HPTE_V_SECONDARY)
1352 		new_pteg = ~hash & new_hash_mask;
1353 
1354 	new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
1355 	new_hptep = (__be64 *)(new->virt + (new_idx << 4));
1356 
1357 	replace_vpte = be64_to_cpu(new_hptep[0]);
1358 
1359 	if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1360 		BUG_ON(new->order >= old->order);
1361 
1362 		if (replace_vpte & HPTE_V_BOLTED) {
1363 			if (vpte & HPTE_V_BOLTED)
1364 				/* Bolted collision, nothing we can do */
1365 				ret = -ENOSPC;
1366 			/* Discard the new HPTE */
1367 			goto out;
1368 		}
1369 
1370 		/* Discard the previous HPTE */
1371 	}
1372 
1373 	new_hptep[1] = cpu_to_be64(rpte);
1374 	new->rev[new_idx].guest_rpte = guest_rpte;
1375 	/* No need for a barrier, since new HPT isn't active */
1376 	new_hptep[0] = cpu_to_be64(vpte);
1377 	unlock_hpte(new_hptep, vpte);
1378 
1379 out:
1380 	unlock_hpte(hptep, vpte);
1381 	return ret;
1382 }
1383 
1384 static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
1385 {
1386 	struct kvm *kvm = resize->kvm;
1387 	unsigned  long i;
1388 	int rc;
1389 
1390 	/*
1391 	 * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs
1392 	 * that POWER9 uses, and could well hit a BUG_ON on POWER9.
1393 	 */
1394 	if (cpu_has_feature(CPU_FTR_ARCH_300))
1395 		return -EIO;
1396 	for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
1397 		rc = resize_hpt_rehash_hpte(resize, i);
1398 		if (rc != 0)
1399 			return rc;
1400 	}
1401 
1402 	return 0;
1403 }
1404 
1405 static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
1406 {
1407 	struct kvm *kvm = resize->kvm;
1408 	struct kvm_hpt_info hpt_tmp;
1409 
1410 	/* Exchange the pending tables in the resize structure with
1411 	 * the active tables */
1412 
1413 	resize_hpt_debug(resize, "resize_hpt_pivot()\n");
1414 
1415 	spin_lock(&kvm->mmu_lock);
1416 	asm volatile("ptesync" : : : "memory");
1417 
1418 	hpt_tmp = kvm->arch.hpt;
1419 	kvmppc_set_hpt(kvm, &resize->hpt);
1420 	resize->hpt = hpt_tmp;
1421 
1422 	spin_unlock(&kvm->mmu_lock);
1423 
1424 	synchronize_srcu_expedited(&kvm->srcu);
1425 
1426 	resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
1427 }
1428 
1429 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
1430 {
1431 	if (WARN_ON(!mutex_is_locked(&kvm->lock)))
1432 		return;
1433 
1434 	if (!resize)
1435 		return;
1436 
1437 	if (resize->error != -EBUSY) {
1438 		if (resize->hpt.virt)
1439 			kvmppc_free_hpt(&resize->hpt);
1440 		kfree(resize);
1441 	}
1442 
1443 	if (kvm->arch.resize_hpt == resize)
1444 		kvm->arch.resize_hpt = NULL;
1445 }
1446 
1447 static void resize_hpt_prepare_work(struct work_struct *work)
1448 {
1449 	struct kvm_resize_hpt *resize = container_of(work,
1450 						     struct kvm_resize_hpt,
1451 						     work);
1452 	struct kvm *kvm = resize->kvm;
1453 	int err = 0;
1454 
1455 	if (WARN_ON(resize->error != -EBUSY))
1456 		return;
1457 
1458 	mutex_lock(&kvm->lock);
1459 
1460 	/* Request is still current? */
1461 	if (kvm->arch.resize_hpt == resize) {
1462 		/* We may be requesting a large allocation here:
1463 		 * don't sleep with kvm->lock held for a long time.
1464 		 */
1465 		mutex_unlock(&kvm->lock);
1466 
1467 		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
1468 				 resize->order);
1469 
1470 		err = resize_hpt_allocate(resize);
1471 
1472 		/* We have a strict assumption about -EBUSY
1473 		 * when preparing for HPT resize.
1474 		 */
1475 		if (WARN_ON(err == -EBUSY))
1476 			err = -EINPROGRESS;
1477 
1478 		mutex_lock(&kvm->lock);
1479 		/* It is possible that kvm->arch.resize_hpt != resize
1480 		 * after we grab kvm->lock again.
1481 		 */
1482 	}
1483 
1484 	resize->error = err;
1485 
1486 	if (kvm->arch.resize_hpt != resize)
1487 		resize_hpt_release(kvm, resize);
1488 
1489 	mutex_unlock(&kvm->lock);
1490 }
1491 
1492 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
1493 				     struct kvm_ppc_resize_hpt *rhpt)
1494 {
1495 	unsigned long flags = rhpt->flags;
1496 	unsigned long shift = rhpt->shift;
1497 	struct kvm_resize_hpt *resize;
1498 	int ret;
1499 
1500 	if (flags != 0)
1501 		return -EINVAL;
1502 
1503 	if (shift && ((shift < 18) || (shift > 46)))
1504 		return -EINVAL;
1505 
1506 	mutex_lock(&kvm->lock);
1507 
1508 	resize = kvm->arch.resize_hpt;
1509 
1510 	if (resize) {
1511 		if (resize->order == shift) {
1512 			/* Suitable resize in progress? */
1513 			ret = resize->error;
1514 			if (ret == -EBUSY)
1515 				ret = 100; /* estimated time in ms */
1516 			else if (ret)
1517 				resize_hpt_release(kvm, resize);
1518 
1519 			goto out;
1520 		}
1521 
1522 		/* not suitable, cancel it */
1523 		resize_hpt_release(kvm, resize);
1524 	}
1525 
1526 	ret = 0;
1527 	if (!shift)
1528 		goto out; /* nothing to do */
1529 
1530 	/* start new resize */
1531 
1532 	resize = kzalloc(sizeof(*resize), GFP_KERNEL);
1533 	if (!resize) {
1534 		ret = -ENOMEM;
1535 		goto out;
1536 	}
1537 
1538 	resize->error = -EBUSY;
1539 	resize->order = shift;
1540 	resize->kvm = kvm;
1541 	INIT_WORK(&resize->work, resize_hpt_prepare_work);
1542 	kvm->arch.resize_hpt = resize;
1543 
1544 	schedule_work(&resize->work);
1545 
1546 	ret = 100; /* estimated time in ms */
1547 
1548 out:
1549 	mutex_unlock(&kvm->lock);
1550 	return ret;
1551 }
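/*
 * A rough sketch of how userspace is expected to drive the PREPARE
 * handler above (hypothetical caller, for illustration only): issue the
 * ioctl with the desired shift and, as long as the return value is
 * positive (an estimated time in milliseconds, 100 here), sleep and
 * retry the same call; a return of 0 means the new HPT has been
 * allocated and a COMMIT can follow, while a negative value is an error
 * and the pending resize has been released.
 */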
1552 
1553 static void resize_hpt_boot_vcpu(void *opaque)
1554 {
1555 	/* Nothing to do, just force a KVM exit */
1556 }
1557 
1558 long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
1559 				    struct kvm_ppc_resize_hpt *rhpt)
1560 {
1561 	unsigned long flags = rhpt->flags;
1562 	unsigned long shift = rhpt->shift;
1563 	struct kvm_resize_hpt *resize;
1564 	long ret;
1565 
1566 	if (flags != 0)
1567 		return -EINVAL;
1568 
1569 	if (shift && ((shift < 18) || (shift > 46)))
1570 		return -EINVAL;
1571 
1572 	mutex_lock(&kvm->lock);
1573 
1574 	resize = kvm->arch.resize_hpt;
1575 
1576 	/* This shouldn't be possible */
1577 	ret = -EIO;
1578 	if (WARN_ON(!kvm->arch.hpte_setup_done))
1579 		goto out_no_hpt;
1580 
1581 	/* Stop VCPUs from running while we mess with the HPT */
1582 	kvm->arch.hpte_setup_done = 0;
1583 	smp_mb();
1584 
1585 	/* Boot all CPUs out of the guest so they re-read
1586 	 * hpte_setup_done */
1587 	on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
1588 
1589 	ret = -ENXIO;
1590 	if (!resize || (resize->order != shift))
1591 		goto out;
1592 
1593 	ret = resize->error;
1594 	if (ret)
1595 		goto out;
1596 
1597 	ret = resize_hpt_rehash(resize);
1598 	if (ret)
1599 		goto out;
1600 
1601 	resize_hpt_pivot(resize);
1602 
1603 out:
1604 	/* Let VCPUs run again */
1605 	kvm->arch.hpte_setup_done = 1;
1606 	smp_mb();
1607 out_no_hpt:
1608 	resize_hpt_release(kvm, resize);
1609 	mutex_unlock(&kvm->lock);
1610 	return ret;
1611 }
1612 
1613 /*
1614  * Functions for reading and writing the hash table via reads and
1615  * writes on a file descriptor.
1616  *
1617  * Reads return the guest view of the hash table, which has to be
1618  * pieced together from the real hash table and the guest_rpte
1619  * values in the revmap array.
1620  *
1621  * On writes, each HPTE written is considered in turn, and if it
1622  * is valid, it is written to the HPT as if an H_ENTER with the
1623  * exact flag set was done.  When the invalid count is non-zero
1624  * in the header written to the stream, the kernel will make
1625  * sure that that many HPTEs are invalid, and invalidate them
1626  * if not.
1627  */
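/*
 * The stream format implied by the description above, as a sketch: the
 * file descriptor carries a sequence of struct kvm_get_htab_header
 * records (index of the first HPTE, n_valid, n_invalid), each followed
 * by n_valid pairs of 64-bit HPTE doublewords; the n_invalid entries
 * that follow them are represented only by the count, not by data.
 */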
1628 
1629 struct kvm_htab_ctx {
1630 	unsigned long	index;
1631 	unsigned long	flags;
1632 	struct kvm	*kvm;
1633 	int		first_pass;
1634 };
1635 
1636 #define HPTE_SIZE	(2 * sizeof(unsigned long))
1637 
1638 /*
1639  * Returns 1 if this HPT entry has been modified or has pending
1640  * R/C bit changes.
1641  */
1642 static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
1643 {
1644 	unsigned long rcbits_unset;
1645 
1646 	if (revp->guest_rpte & HPTE_GR_MODIFIED)
1647 		return 1;
1648 
1649 	/* Also need to consider changes in reference and changed bits */
1650 	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1651 	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
1652 	    (be64_to_cpu(hptp[1]) & rcbits_unset))
1653 		return 1;
1654 
1655 	return 0;
1656 }
1657 
1658 static long record_hpte(unsigned long flags, __be64 *hptp,
1659 			unsigned long *hpte, struct revmap_entry *revp,
1660 			int want_valid, int first_pass)
1661 {
1662 	unsigned long v, r, hr;
1663 	unsigned long rcbits_unset;
1664 	int ok = 1;
1665 	int valid, dirty;
1666 
1667 	/* Unmodified entries are uninteresting except on the first pass */
1668 	dirty = hpte_dirty(revp, hptp);
1669 	if (!first_pass && !dirty)
1670 		return 0;
1671 
1672 	valid = 0;
1673 	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1674 		valid = 1;
1675 		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
1676 		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
1677 			valid = 0;
1678 	}
1679 	if (valid != want_valid)
1680 		return 0;
1681 
1682 	v = r = 0;
1683 	if (valid || dirty) {
1684 		/* lock the HPTE so it's stable and read it */
1685 		preempt_disable();
1686 		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1687 			cpu_relax();
1688 		v = be64_to_cpu(hptp[0]);
1689 		hr = be64_to_cpu(hptp[1]);
1690 		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1691 			v = hpte_new_to_old_v(v, hr);
1692 			hr = hpte_new_to_old_r(hr);
1693 		}
1694 
1695 		/* re-evaluate valid and dirty from synchronized HPTE value */
1696 		valid = !!(v & HPTE_V_VALID);
1697 		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1698 
1699 		/* Harvest R and C into guest view if necessary */
1700 		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1701 		if (valid && (rcbits_unset & hr)) {
1702 			revp->guest_rpte |= (hr &
1703 				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
1704 			dirty = 1;
1705 		}
1706 
1707 		if (v & HPTE_V_ABSENT) {
1708 			v &= ~HPTE_V_ABSENT;
1709 			v |= HPTE_V_VALID;
1710 			valid = 1;
1711 		}
1712 		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1713 			valid = 0;
1714 
1715 		r = revp->guest_rpte;
1716 		/* only clear modified if this is the right sort of entry */
1717 		if (valid == want_valid && dirty) {
1718 			r &= ~HPTE_GR_MODIFIED;
1719 			revp->guest_rpte = r;
1720 		}
1721 		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
1722 		preempt_enable();
1723 		if (!(valid == want_valid && (first_pass || dirty)))
1724 			ok = 0;
1725 	}
1726 	hpte[0] = cpu_to_be64(v);
1727 	hpte[1] = cpu_to_be64(r);
1728 	return ok;
1729 }
1730 
1731 static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1732 			     size_t count, loff_t *ppos)
1733 {
1734 	struct kvm_htab_ctx *ctx = file->private_data;
1735 	struct kvm *kvm = ctx->kvm;
1736 	struct kvm_get_htab_header hdr;
1737 	__be64 *hptp;
1738 	struct revmap_entry *revp;
1739 	unsigned long i, nb, nw;
1740 	unsigned long __user *lbuf;
1741 	struct kvm_get_htab_header __user *hptr;
1742 	unsigned long flags;
1743 	int first_pass;
1744 	unsigned long hpte[2];
1745 
1746 	if (!access_ok(VERIFY_WRITE, buf, count))
1747 		return -EFAULT;
1748 
1749 	first_pass = ctx->first_pass;
1750 	flags = ctx->flags;
1751 
1752 	i = ctx->index;
1753 	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1754 	revp = kvm->arch.hpt.rev + i;
1755 	lbuf = (unsigned long __user *)buf;
1756 
1757 	nb = 0;
1758 	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
1759 		/* Initialize header */
1760 		hptr = (struct kvm_get_htab_header __user *)buf;
1761 		hdr.n_valid = 0;
1762 		hdr.n_invalid = 0;
1763 		nw = nb;
1764 		nb += sizeof(hdr);
1765 		lbuf = (unsigned long __user *)(buf + sizeof(hdr));
1766 
1767 		/* Skip uninteresting entries, i.e. clean on not-first pass */
1768 		if (!first_pass) {
1769 			while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1770 			       !hpte_dirty(revp, hptp)) {
1771 				++i;
1772 				hptp += 2;
1773 				++revp;
1774 			}
1775 		}
1776 		hdr.index = i;
1777 
1778 		/* Grab a series of valid entries */
1779 		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1780 		       hdr.n_valid < 0xffff &&
1781 		       nb + HPTE_SIZE < count &&
1782 		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
1783 			/* valid entry, write it out */
1784 			++hdr.n_valid;
1785 			if (__put_user(hpte[0], lbuf) ||
1786 			    __put_user(hpte[1], lbuf + 1))
1787 				return -EFAULT;
1788 			nb += HPTE_SIZE;
1789 			lbuf += 2;
1790 			++i;
1791 			hptp += 2;
1792 			++revp;
1793 		}
1794 		/* Now skip invalid entries while we can */
1795 		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1796 		       hdr.n_invalid < 0xffff &&
1797 		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1798 			/* found an invalid entry */
1799 			++hdr.n_invalid;
1800 			++i;
1801 			hptp += 2;
1802 			++revp;
1803 		}
1804 
1805 		if (hdr.n_valid || hdr.n_invalid) {
1806 			/* write back the header */
1807 			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
1808 				return -EFAULT;
1809 			nw = nb;
1810 			buf = (char __user *)lbuf;
1811 		} else {
1812 			nb = nw;
1813 		}
1814 
1815 		/* Check if we've wrapped around the hash table */
1816 		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
1817 			i = 0;
1818 			ctx->first_pass = 0;
1819 			break;
1820 		}
1821 	}
1822 
1823 	ctx->index = i;
1824 
1825 	return nb;
1826 }
1827 
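/*
 * write() handler for the same file descriptor: consumes the header +
 * HPTE stream produced by kvm_htab_read() and installs the entries in
 * the guest HPT via kvmppc_virtmode_do_h_enter(), removing any entry
 * already present at that index.  The write is refused with -EBUSY if
 * any vcpu is running.  If HPT setup has not completed yet, the first
 * VRMA entry seen is also used to derive kvm->arch.vrma_slb_v and the
 * LPCR VRMASD field.
 */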
static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	unsigned long i, j;
	unsigned long v, r;
	unsigned long __user *lbuf;
	__be64 *hptp;
	unsigned long tmp[2];
	ssize_t nb;
	long int err, ret;
	int hpte_setup;

	if (!access_ok(VERIFY_READ, buf, count))
		return -EFAULT;

	/* lock out vcpus from running while we're doing this */
	mutex_lock(&kvm->lock);
	hpte_setup = kvm->arch.hpte_setup_done;
	if (hpte_setup) {
		kvm->arch.hpte_setup_done = 0;	/* temporarily */
		/* order hpte_setup_done vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.hpte_setup_done = 1;
			mutex_unlock(&kvm->lock);
			return -EBUSY;
		}
	}

	err = 0;
	for (nb = 0; nb + sizeof(hdr) <= count; ) {
		err = -EFAULT;
		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
			break;

		err = 0;
		if (nb + hdr.n_valid * HPTE_SIZE > count)
			break;

		nb += sizeof(hdr);
		buf += sizeof(hdr);

		err = -EINVAL;
		i = hdr.index;
		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
		    i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
			break;

		hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
		lbuf = (unsigned long __user *)buf;
		for (j = 0; j < hdr.n_valid; ++j) {
			__be64 hpte_v;
			__be64 hpte_r;

			err = -EFAULT;
			if (__get_user(hpte_v, lbuf) ||
			    __get_user(hpte_r, lbuf + 1))
				goto out;
			v = be64_to_cpu(hpte_v);
			r = be64_to_cpu(hpte_r);
			err = -EINVAL;
			if (!(v & HPTE_V_VALID))
				goto out;
			lbuf += 2;
			nb += HPTE_SIZE;

			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			err = -EIO;
			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
							 tmp);
			if (ret != H_SUCCESS) {
				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
				       "r=%lx\n", ret, i, v, r);
				goto out;
			}
			if (!hpte_setup && is_vrma_hpte(v)) {
				unsigned long psize = hpte_base_page_size(v, r);
				unsigned long senc = slb_pgsize_encoding(psize);
				unsigned long lpcr;

				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
					(VRMA_VSID << SLB_VSID_SHIFT_1T);
				lpcr = senc << (LPCR_VRMASD_SH - 4);
				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
				hpte_setup = 1;
			}
			++i;
			hptp += 2;
		}

		for (j = 0; j < hdr.n_invalid; ++j) {
			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			++i;
			hptp += 2;
		}
		err = 0;
	}

 out:
	/* Order HPTE updates vs. hpte_setup_done */
	smp_wmb();
	kvm->arch.hpte_setup_done = hpte_setup;
	mutex_unlock(&kvm->lock);

	if (err)
		return err;
	return nb;
}

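/*
 * Release the HTAB fd: drop the HPTE modification interest taken for
 * read-only fds and put the kvm reference held by the context.
 */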
static int kvm_htab_release(struct inode *inode, struct file *filp)
{
	struct kvm_htab_ctx *ctx = filp->private_data;

	filp->private_data = NULL;
	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
	kvm_put_kvm(ctx->kvm);
	kfree(ctx);
	return 0;
}

static const struct file_operations kvm_htab_fops = {
	.read		= kvm_htab_read,
	.write		= kvm_htab_write,
	.llseek		= default_llseek,
	.release	= kvm_htab_release,
};

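/*
 * Implements the KVM_PPC_GET_HTAB_FD vm ioctl: hands userspace an
 * anonymous fd bound to kvm_htab_fops, read-only unless
 * KVM_GET_HTAB_WRITE is requested.  For readers, hpte_mod_interest is
 * bumped so that kvmppc_do_h_enter() and friends start flagging
 * modified entries, which is what makes the incremental passes in
 * kvm_htab_read() possible (e.g. while transferring the HPT during
 * migration).
 */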
int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
{
	int ret;
	struct kvm_htab_ctx *ctx;
	int rwflag;

	/* reject flags we don't recognize */
	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
		return -EINVAL;
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	kvm_get_kvm(kvm);
	ctx->kvm = kvm;
	ctx->index = ghf->start_index;
	ctx->flags = ghf->flags;
	ctx->first_pass = 1;

	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
	if (ret < 0) {
		kfree(ctx);
		kvm_put_kvm(kvm);
		return ret;
	}

	if (rwflag == O_RDONLY) {
		mutex_lock(&kvm->slots_lock);
		atomic_inc(&kvm->arch.hpte_mod_interest);
		/* make sure kvmppc_do_h_enter etc. see the increment */
		synchronize_srcu_expedited(&kvm->srcu);
		mutex_unlock(&kvm->slots_lock);
	}

	return ret;
}

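/*
 * Illustrative userspace sketch (not part of this file, and only a
 * sketch): one way a tool could drain the HPT stream described above.
 * It assumes the exported uapi definitions of KVM_PPC_GET_HTAB_FD,
 * struct kvm_get_htab_fd and struct kvm_get_htab_header; the helper
 * name dump_hpt() and the emit() callback are invented for the
 * example.  Each read() returns a sequence of kvm_get_htab_header
 * records, each followed by n_valid 16-byte HPTEs; a return of 0
 * means there is nothing more to report for now.
 *
 *	#include <linux/kvm.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *
 *	static int dump_hpt(int vm_fd, void (*emit)(const char *, ssize_t))
 *	{
 *		struct kvm_get_htab_fd ghf = {
 *			.flags = 0,
 *			.start_index = 0,
 *		};
 *		char buf[1 << 16];
 *		ssize_t nb;
 *		int fd = ioctl(vm_fd, KVM_PPC_GET_HTAB_FD, &ghf);
 *
 *		if (fd < 0)
 *			return -1;
 *		while ((nb = read(fd, buf, sizeof(buf))) > 0)
 *			emit(buf, nb);
 *		close(fd);
 *		return nb < 0 ? -1 : 0;
 *	}
 */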
struct debugfs_htab_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	hpt_index;
	int		chars_left;
	int		buf_index;
	char		buf[64];
};

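/*
 * debugfs interface: the "htab" file under the VM's debugfs directory
 * dumps the guest hashed page table in text form, one line per valid
 * entry (see debugfs_htab_read() below).  The debugfs_htab_state
 * above tracks the current position and any partially emitted line
 * between reads.
 */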
static int debugfs_htab_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_htab_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_htab_release(struct inode *inode, struct file *file)
{
	struct debugfs_htab_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

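/*
 * Emit one text line per valid or absent HPTE: the index, the first
 * and second doublewords as stored in the table, and the guest view
 * of the second doubleword from the reverse map, in the format of the
 * scnprintf() below.  Partially copied lines are carried over to the
 * next read via chars_left/buf_index.
 */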
static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct debugfs_htab_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long i, n;
	unsigned long v, hr, gr;
	struct kvm *kvm;
	__be64 *hptp;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	kvm = p->kvm;
	i = p->hpt_index;
	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
	for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
	     ++i, hptp += 2) {
		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
		hr = be64_to_cpu(hptp[1]);
		gr = kvm->arch.hpt.rev[i].guest_rpte;
		unlock_hpte(hptp, v);
		preempt_enable();

		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		n = scnprintf(p->buf, sizeof(p->buf),
			      "%6lx %.16lx %.16lx %.16lx\n",
			      i, v, hr, gr);
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			goto out;
		}
	}
	p->hpt_index = i;

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
			   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_htab_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_htab_open,
	.release = debugfs_htab_release,
	.read	 = debugfs_htab_read,
	.write	 = debugfs_htab_write,
	.llseek	 = generic_file_llseek,
};

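/*
 * Create the read-only (0400) "htab" file in this VM's debugfs
 * directory, backed by debugfs_htab_fops above.
 */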
void kvmppc_mmu_debugfs_init(struct kvm *kvm)
{
	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
						    kvm->arch.debugfs_dir, kvm,
						    &debugfs_htab_fops);
}

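/*
 * Set up the MMU callbacks for an HV guest vcpu: address translation
 * goes through the radix or HPT xlate routine depending on the guest
 * MMU mode, and the vcpu gets the 32-entry SLB of POWER7/POWER8.
 */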
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */

	if (kvm_is_radix(vcpu->kvm))
		mmu->xlate = kvmppc_mmu_radix_xlate;
	else
		mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}