1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * Copyright (C) 2006 Qumranet, Inc.
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  */
15 
16 #include <kvm/iodev.h>
17 
18 #include <linux/kvm_host.h>
19 #include <linux/kvm.h>
20 #include <linux/module.h>
21 #include <linux/errno.h>
22 #include <linux/percpu.h>
23 #include <linux/mm.h>
24 #include <linux/miscdevice.h>
25 #include <linux/vmalloc.h>
26 #include <linux/reboot.h>
27 #include <linux/debugfs.h>
28 #include <linux/highmem.h>
29 #include <linux/file.h>
30 #include <linux/syscore_ops.h>
31 #include <linux/cpu.h>
32 #include <linux/sched/signal.h>
33 #include <linux/sched/mm.h>
34 #include <linux/sched/stat.h>
35 #include <linux/cpumask.h>
36 #include <linux/smp.h>
37 #include <linux/anon_inodes.h>
38 #include <linux/profile.h>
39 #include <linux/kvm_para.h>
40 #include <linux/pagemap.h>
41 #include <linux/mman.h>
42 #include <linux/swap.h>
43 #include <linux/bitops.h>
44 #include <linux/spinlock.h>
45 #include <linux/compat.h>
46 #include <linux/srcu.h>
47 #include <linux/hugetlb.h>
48 #include <linux/slab.h>
49 #include <linux/sort.h>
50 #include <linux/bsearch.h>
51 #include <linux/io.h>
52 #include <linux/lockdep.h>
53 #include <linux/kthread.h>
54 
55 #include <asm/processor.h>
56 #include <asm/ioctl.h>
57 #include <linux/uaccess.h>
58 #include <asm/pgtable.h>
59 
60 #include "coalesced_mmio.h"
61 #include "async_pf.h"
62 #include "vfio.h"
63 
64 #define CREATE_TRACE_POINTS
65 #include <trace/events/kvm.h>
66 
67 /* Worst case buffer size needed for holding an integer. */
68 #define ITOA_MAX_LEN 12
69 
70 MODULE_AUTHOR("Qumranet");
71 MODULE_LICENSE("GPL");
72 
73 /* Architectures should define their poll value according to the halt latency */
74 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
75 module_param(halt_poll_ns, uint, 0644);
76 EXPORT_SYMBOL_GPL(halt_poll_ns);
77 
78 /* Default doubles per-vcpu halt_poll_ns. */
79 unsigned int halt_poll_ns_grow = 2;
80 module_param(halt_poll_ns_grow, uint, 0644);
81 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
82 
83 /* The start value to grow halt_poll_ns from */
84 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
85 module_param(halt_poll_ns_grow_start, uint, 0644);
86 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
87 
88 /* Default resets per-vcpu halt_poll_ns. */
89 unsigned int halt_poll_ns_shrink;
90 module_param(halt_poll_ns_shrink, uint, 0644);
91 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
92 
93 /*
94  * Ordering of locks:
95  *
96  *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
97  */
98 
99 DEFINE_MUTEX(kvm_lock);
100 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
101 LIST_HEAD(vm_list);
102 
103 static cpumask_var_t cpus_hardware_enabled;
104 static int kvm_usage_count;
105 static atomic_t hardware_enable_failed;
106 
107 struct kmem_cache *kvm_vcpu_cache;
108 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
109 
110 static __read_mostly struct preempt_ops kvm_preempt_ops;
111 
112 struct dentry *kvm_debugfs_dir;
113 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
114 
115 static int kvm_debugfs_num_entries;
116 static const struct file_operations *stat_fops_per_vm[];
117 
118 static struct file_operations kvm_chardev_ops;
119 
120 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
121 			   unsigned long arg);
122 #ifdef CONFIG_KVM_COMPAT
123 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
124 				  unsigned long arg);
125 #define KVM_COMPAT(c)	.compat_ioctl	= (c)
126 #else
127 /*
128  * For architectures that don't implement a compat infrastructure,
129  * adopt a double line of defense:
130  * - Prevent a compat task from opening /dev/kvm
131  * - If the open has been done by a 64bit task, and the KVM fd
132  *   passed to a compat task, let the ioctls fail.
133  */
134 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
135 				unsigned long arg) { return -EINVAL; }
136 
137 static int kvm_no_compat_open(struct inode *inode, struct file *file)
138 {
139 	return is_compat_task() ? -ENODEV : 0;
140 }
141 #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
142 			.open		= kvm_no_compat_open
143 #endif
144 static int hardware_enable_all(void);
145 static void hardware_disable_all(void);
146 
147 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
148 
149 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
150 
151 __visible bool kvm_rebooting;
152 EXPORT_SYMBOL_GPL(kvm_rebooting);
153 
154 static bool largepages_enabled = true;
155 
156 #define KVM_EVENT_CREATE_VM 0
157 #define KVM_EVENT_DESTROY_VM 1
158 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
159 static unsigned long long kvm_createvm_count;
160 static unsigned long long kvm_active_vms;
161 
162 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
163 						   unsigned long start, unsigned long end)
164 {
165 }
166 
167 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
168 {
169 	/*
170 	 * The metadata used by is_zone_device_page() to determine whether or
171 	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
172 	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
173 	 * page_count() is zero to help detect bad usage of this helper.
174 	 */
175 	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
176 		return false;
177 
178 	return is_zone_device_page(pfn_to_page(pfn));
179 }
180 
181 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
182 {
183 	/*
184 	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
185 	 * perspective they are "normal" pages, albeit with slightly different
186 	 * usage rules.
187 	 */
188 	if (pfn_valid(pfn))
189 		return PageReserved(pfn_to_page(pfn)) &&
190 		       !is_zero_pfn(pfn) &&
191 		       !kvm_is_zone_device_pfn(pfn);
192 
193 	return true;
194 }
195 
196 /*
197  * Switches to specified vcpu, until a matching vcpu_put()
198  */
199 void vcpu_load(struct kvm_vcpu *vcpu)
200 {
201 	int cpu = get_cpu();
202 	preempt_notifier_register(&vcpu->preempt_notifier);
203 	kvm_arch_vcpu_load(vcpu, cpu);
204 	put_cpu();
205 }
206 EXPORT_SYMBOL_GPL(vcpu_load);
207 
208 void vcpu_put(struct kvm_vcpu *vcpu)
209 {
210 	preempt_disable();
211 	kvm_arch_vcpu_put(vcpu);
212 	preempt_notifier_unregister(&vcpu->preempt_notifier);
213 	preempt_enable();
214 }
215 EXPORT_SYMBOL_GPL(vcpu_put);
216 
217 /* TODO: merge with kvm_arch_vcpu_should_kick */
218 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
219 {
220 	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
221 
222 	/*
223 	 * We need to wait for the VCPU to reenable interrupts and get out of
224 	 * READING_SHADOW_PAGE_TABLES mode.
225 	 */
226 	if (req & KVM_REQUEST_WAIT)
227 		return mode != OUTSIDE_GUEST_MODE;
228 
229 	/*
230 	 * Need to kick a running VCPU, but otherwise there is nothing to do.
231 	 */
232 	return mode == IN_GUEST_MODE;
233 }
234 
235 static void ack_flush(void *_completed)
236 {
237 }
238 
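/*
 * Send a kick IPI to every CPU in @cpus (ack_flush itself is a no-op, the
 * interrupt is what forces a vCPU out of guest mode).  A NULL @cpus, e.g.
 * when the temporary cpumask could not be allocated, targets all online
 * CPUs.  Returns false if there was nobody to kick.
 */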
239 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
240 {
241 	if (unlikely(!cpus))
242 		cpus = cpu_online_mask;
243 
244 	if (cpumask_empty(cpus))
245 		return false;
246 
247 	smp_call_function_many(cpus, ack_flush, NULL, wait);
248 	return true;
249 }
250 
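/*
 * Make @req pending on every vCPU selected by @vcpu_bitmap (all vCPUs when
 * the bitmap is NULL).  Sleeping vCPUs are woken up unless the request has
 * KVM_REQUEST_NO_WAKEUP; vCPUs still running in guest mode are collected in
 * @tmp and kicked with an IPI.  Returns true if any CPU was kicked.
 */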
251 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
252 				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
253 {
254 	int i, cpu, me;
255 	struct kvm_vcpu *vcpu;
256 	bool called;
257 
258 	me = get_cpu();
259 
260 	kvm_for_each_vcpu(i, vcpu, kvm) {
261 		if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
262 			continue;
263 
264 		kvm_make_request(req, vcpu);
265 		cpu = vcpu->cpu;
266 
267 		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
268 			continue;
269 
270 		if (tmp != NULL && cpu != -1 && cpu != me &&
271 		    kvm_request_needs_ipi(vcpu, req))
272 			__cpumask_set_cpu(cpu, tmp);
273 	}
274 
275 	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
276 	put_cpu();
277 
278 	return called;
279 }
280 
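/*
 * Make @req pending on all vCPUs of @kvm and kick those currently in guest
 * mode.  If the temporary cpumask cannot be allocated, every online CPU is
 * kicked instead.
 */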
281 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
282 {
283 	cpumask_var_t cpus;
284 	bool called;
285 
286 	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
287 
288 	called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus);
289 
290 	free_cpumask_var(cpus);
291 	return called;
292 }
293 
294 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
295 void kvm_flush_remote_tlbs(struct kvm *kvm)
296 {
297 	/*
298 	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
299 	 * kvm_make_all_cpus_request.
300 	 */
301 	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
302 
303 	/*
304 	 * We want to publish modifications to the page tables before reading
305 	 * vcpu->mode. Pairs with a memory barrier in arch-specific code.
306 	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
307 	 * and smp_mb in walk_shadow_page_lockless_begin/end.
308 	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
309 	 *
310 	 * There is already an smp_mb__after_atomic() before
311 	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
312 	 * barrier here.
313 	 */
314 	if (!kvm_arch_flush_remote_tlb(kvm)
315 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
316 		++kvm->stat.remote_tlb_flush;
317 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
318 }
319 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
320 #endif
321 
322 void kvm_reload_remote_mmus(struct kvm *kvm)
323 {
324 	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
325 }
326 
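/*
 * Initialize the architecture-independent parts of a vCPU: the mutex, wait
 * queue, async page fault state and the shared kvm_run page, then let the
 * architecture finish the job via kvm_arch_vcpu_init().
 */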
327 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
328 {
329 	struct page *page;
330 	int r;
331 
332 	mutex_init(&vcpu->mutex);
333 	vcpu->cpu = -1;
334 	vcpu->kvm = kvm;
335 	vcpu->vcpu_id = id;
336 	vcpu->pid = NULL;
337 	init_swait_queue_head(&vcpu->wq);
338 	kvm_async_pf_vcpu_init(vcpu);
339 
340 	vcpu->pre_pcpu = -1;
341 	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
342 
343 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
344 	if (!page) {
345 		r = -ENOMEM;
346 		goto fail;
347 	}
348 	vcpu->run = page_address(page);
349 
350 	kvm_vcpu_set_in_spin_loop(vcpu, false);
351 	kvm_vcpu_set_dy_eligible(vcpu, false);
352 	vcpu->preempted = false;
353 	vcpu->ready = false;
354 
355 	r = kvm_arch_vcpu_init(vcpu);
356 	if (r < 0)
357 		goto fail_free_run;
358 	return 0;
359 
360 fail_free_run:
361 	free_page((unsigned long)vcpu->run);
362 fail:
363 	return r;
364 }
365 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
366 
367 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
368 {
369 	/*
370 	 * no need for rcu_read_lock as VCPU_RUN is the only place that
371 	 * will change the vcpu->pid pointer and on uninit all file
372 	 * descriptors are already gone.
373 	 */
374 	put_pid(rcu_dereference_protected(vcpu->pid, 1));
375 	kvm_arch_vcpu_uninit(vcpu);
376 	free_page((unsigned long)vcpu->run);
377 }
378 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
379 
380 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
381 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
382 {
383 	return container_of(mn, struct kvm, mmu_notifier);
384 }
385 
386 static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
387 					      struct mm_struct *mm,
388 					      unsigned long start, unsigned long end)
389 {
390 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
391 	int idx;
392 
393 	idx = srcu_read_lock(&kvm->srcu);
394 	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
395 	srcu_read_unlock(&kvm->srcu, idx);
396 }
397 
398 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
399 					struct mm_struct *mm,
400 					unsigned long address,
401 					pte_t pte)
402 {
403 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
404 	int idx;
405 
406 	idx = srcu_read_lock(&kvm->srcu);
407 	spin_lock(&kvm->mmu_lock);
408 	kvm->mmu_notifier_seq++;
409 
410 	if (kvm_set_spte_hva(kvm, address, pte))
411 		kvm_flush_remote_tlbs(kvm);
412 
413 	spin_unlock(&kvm->mmu_lock);
414 	srcu_read_unlock(&kvm->srcu, idx);
415 }
416 
417 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
418 					const struct mmu_notifier_range *range)
419 {
420 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
421 	int need_tlb_flush = 0, idx;
422 
423 	idx = srcu_read_lock(&kvm->srcu);
424 	spin_lock(&kvm->mmu_lock);
425 	/*
426 	 * The count increase must become visible at unlock time as no
427 	 * spte can be established without taking the mmu_lock and
428 	 * count is also read inside the mmu_lock critical section.
429 	 */
430 	kvm->mmu_notifier_count++;
431 	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
432 					     range->flags);
433 	/* we have to flush the tlb before the pages can be freed */
434 	if (need_tlb_flush || kvm->tlbs_dirty)
435 		kvm_flush_remote_tlbs(kvm);
436 
437 	spin_unlock(&kvm->mmu_lock);
438 	srcu_read_unlock(&kvm->srcu, idx);
439 
440 	return 0;
441 }
442 
443 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
444 					const struct mmu_notifier_range *range)
445 {
446 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
447 
448 	spin_lock(&kvm->mmu_lock);
449 	/*
450 	 * This sequence increase will notify the kvm page fault that
451 	 * the page that is going to be mapped in the spte could have
452 	 * been freed.
453 	 */
454 	kvm->mmu_notifier_seq++;
455 	smp_wmb();
456 	/*
457 	 * The above sequence increase must be visible before the
458 	 * below count decrease, which is ensured by the smp_wmb above
459 	 * in conjunction with the smp_rmb in mmu_notifier_retry().
460 	 */
461 	kvm->mmu_notifier_count--;
462 	spin_unlock(&kvm->mmu_lock);
463 
464 	BUG_ON(kvm->mmu_notifier_count < 0);
465 }
466 
467 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
468 					      struct mm_struct *mm,
469 					      unsigned long start,
470 					      unsigned long end)
471 {
472 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
473 	int young, idx;
474 
475 	idx = srcu_read_lock(&kvm->srcu);
476 	spin_lock(&kvm->mmu_lock);
477 
478 	young = kvm_age_hva(kvm, start, end);
479 	if (young)
480 		kvm_flush_remote_tlbs(kvm);
481 
482 	spin_unlock(&kvm->mmu_lock);
483 	srcu_read_unlock(&kvm->srcu, idx);
484 
485 	return young;
486 }
487 
488 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
489 					struct mm_struct *mm,
490 					unsigned long start,
491 					unsigned long end)
492 {
493 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
494 	int young, idx;
495 
496 	idx = srcu_read_lock(&kvm->srcu);
497 	spin_lock(&kvm->mmu_lock);
498 	/*
499 	 * Even though we do not flush TLB, this will still adversely
500 	 * affect performance on pre-Haswell Intel EPT, where there is
501 	 * no EPT Access Bit to clear so that we have to tear down EPT
502 	 * tables instead. If we find this unacceptable, we can always
503 	 * add a parameter to kvm_age_hva so that it effectively doesn't
504 	 * do anything on clear_young.
505 	 *
506 	 * Also note that currently we never issue secondary TLB flushes
507 	 * from clear_young, leaving this job up to the regular system
508 	 * cadence. If we find this inaccurate, we might come up with a
509 	 * more sophisticated heuristic later.
510 	 */
511 	young = kvm_age_hva(kvm, start, end);
512 	spin_unlock(&kvm->mmu_lock);
513 	srcu_read_unlock(&kvm->srcu, idx);
514 
515 	return young;
516 }
517 
518 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
519 				       struct mm_struct *mm,
520 				       unsigned long address)
521 {
522 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
523 	int young, idx;
524 
525 	idx = srcu_read_lock(&kvm->srcu);
526 	spin_lock(&kvm->mmu_lock);
527 	young = kvm_test_age_hva(kvm, address);
528 	spin_unlock(&kvm->mmu_lock);
529 	srcu_read_unlock(&kvm->srcu, idx);
530 
531 	return young;
532 }
533 
534 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
535 				     struct mm_struct *mm)
536 {
537 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
538 	int idx;
539 
540 	idx = srcu_read_lock(&kvm->srcu);
541 	kvm_arch_flush_shadow_all(kvm);
542 	srcu_read_unlock(&kvm->srcu, idx);
543 }
544 
545 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
546 	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
547 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
548 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
549 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
550 	.clear_young		= kvm_mmu_notifier_clear_young,
551 	.test_young		= kvm_mmu_notifier_test_young,
552 	.change_pte		= kvm_mmu_notifier_change_pte,
553 	.release		= kvm_mmu_notifier_release,
554 };
555 
556 static int kvm_init_mmu_notifier(struct kvm *kvm)
557 {
558 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
559 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
560 }
561 
562 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
563 
564 static int kvm_init_mmu_notifier(struct kvm *kvm)
565 {
566 	return 0;
567 }
568 
569 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
570 
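/*
 * Allocate an empty memslot array.  Every slot starts out unused, with its
 * id equal to its index so that id_to_index lookups are already valid.
 */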
571 static struct kvm_memslots *kvm_alloc_memslots(void)
572 {
573 	int i;
574 	struct kvm_memslots *slots;
575 
576 	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
577 	if (!slots)
578 		return NULL;
579 
580 	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
581 		slots->id_to_index[i] = slots->memslots[i].id = i;
582 
583 	return slots;
584 }
585 
586 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
587 {
588 	if (!memslot->dirty_bitmap)
589 		return;
590 
591 	kvfree(memslot->dirty_bitmap);
592 	memslot->dirty_bitmap = NULL;
593 }
594 
595 /*
596  * Free any memory in @free but not in @dont.
597  */
598 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
599 			      struct kvm_memory_slot *dont)
600 {
601 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
602 		kvm_destroy_dirty_bitmap(free);
603 
604 	kvm_arch_free_memslot(kvm, free, dont);
605 
606 	free->npages = 0;
607 }
608 
609 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
610 {
611 	struct kvm_memory_slot *memslot;
612 
613 	if (!slots)
614 		return;
615 
616 	kvm_for_each_memslot(memslot, slots)
617 		kvm_free_memslot(kvm, memslot, NULL);
618 
619 	kvfree(slots);
620 }
621 
622 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
623 {
624 	int i;
625 
626 	if (!kvm->debugfs_dentry)
627 		return;
628 
629 	debugfs_remove_recursive(kvm->debugfs_dentry);
630 
631 	if (kvm->debugfs_stat_data) {
632 		for (i = 0; i < kvm_debugfs_num_entries; i++)
633 			kfree(kvm->debugfs_stat_data[i]);
634 		kfree(kvm->debugfs_stat_data);
635 	}
636 }
637 
638 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
639 {
640 	static DEFINE_MUTEX(kvm_debugfs_lock);
641 	struct dentry *dent;
642 	char dir_name[ITOA_MAX_LEN * 2];
643 	struct kvm_stat_data *stat_data;
644 	struct kvm_stats_debugfs_item *p;
645 
646 	if (!debugfs_initialized())
647 		return 0;
648 
649 	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
650 	mutex_lock(&kvm_debugfs_lock);
651 	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
652 	if (dent) {
653 		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
654 		dput(dent);
655 		mutex_unlock(&kvm_debugfs_lock);
656 		return 0;
657 	}
658 	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
659 	mutex_unlock(&kvm_debugfs_lock);
660 	if (IS_ERR(dent))
661 		return 0;
662 
663 	kvm->debugfs_dentry = dent;
664 	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
665 					 sizeof(*kvm->debugfs_stat_data),
666 					 GFP_KERNEL_ACCOUNT);
667 	if (!kvm->debugfs_stat_data)
668 		return -ENOMEM;
669 
670 	for (p = debugfs_entries; p->name; p++) {
671 		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
672 		if (!stat_data)
673 			return -ENOMEM;
674 
675 		stat_data->kvm = kvm;
676 		stat_data->offset = p->offset;
677 		stat_data->mode = p->mode ? p->mode : 0644;
678 		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
679 		debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
680 				    stat_data, stat_fops_per_vm[p->kind]);
681 	}
682 	return 0;
683 }
684 
685 /*
686  * Called after the VM is otherwise initialized, but just before adding it to
687  * the vm_list.
688  */
689 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
690 {
691 	return 0;
692 }
693 
694 /*
695  * Called just after removing the VM from the vm_list, but before doing any
696  * other destruction.
697  */
698 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
699 {
700 }
701 
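/*
 * Allocate and initialize a new VM: locks, SRCU state, one memslot array
 * per address space, the I/O buses, architecture state, hardware
 * virtualization and the MMU notifier.  On success the VM is on vm_list
 * and holds a reference to current->mm and to the kvm module.
 */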
702 static struct kvm *kvm_create_vm(unsigned long type)
703 {
704 	struct kvm *kvm = kvm_arch_alloc_vm();
705 	int r = -ENOMEM;
706 	int i;
707 
708 	if (!kvm)
709 		return ERR_PTR(-ENOMEM);
710 
711 	spin_lock_init(&kvm->mmu_lock);
712 	mmgrab(current->mm);
713 	kvm->mm = current->mm;
714 	kvm_eventfd_init(kvm);
715 	mutex_init(&kvm->lock);
716 	mutex_init(&kvm->irq_lock);
717 	mutex_init(&kvm->slots_lock);
718 	INIT_LIST_HEAD(&kvm->devices);
719 
720 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
721 
722 	if (init_srcu_struct(&kvm->srcu))
723 		goto out_err_no_srcu;
724 	if (init_srcu_struct(&kvm->irq_srcu))
725 		goto out_err_no_irq_srcu;
726 
727 	refcount_set(&kvm->users_count, 1);
728 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
729 		struct kvm_memslots *slots = kvm_alloc_memslots();
730 
731 		if (!slots)
732 			goto out_err_no_arch_destroy_vm;
733 		/* Generations must be different for each address space. */
734 		slots->generation = i;
735 		rcu_assign_pointer(kvm->memslots[i], slots);
736 	}
737 
738 	for (i = 0; i < KVM_NR_BUSES; i++) {
739 		rcu_assign_pointer(kvm->buses[i],
740 			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
741 		if (!kvm->buses[i])
742 			goto out_err_no_arch_destroy_vm;
743 	}
744 
745 	r = kvm_arch_init_vm(kvm, type);
746 	if (r)
747 		goto out_err_no_arch_destroy_vm;
748 
749 	r = hardware_enable_all();
750 	if (r)
751 		goto out_err_no_disable;
752 
753 #ifdef CONFIG_HAVE_KVM_IRQFD
754 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
755 #endif
756 
757 	r = kvm_init_mmu_notifier(kvm);
758 	if (r)
759 		goto out_err_no_mmu_notifier;
760 
761 	r = kvm_arch_post_init_vm(kvm);
762 	if (r)
763 		goto out_err;
764 
765 	mutex_lock(&kvm_lock);
766 	list_add(&kvm->vm_list, &vm_list);
767 	mutex_unlock(&kvm_lock);
768 
769 	preempt_notifier_inc();
770 
771 	/*
772 	 * When the fd passed to this ioctl() is opened it pins the module,
773 	 * but try_module_get() also prevents getting a reference if the module
774 	 * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
775 	 */
776 	if (!try_module_get(kvm_chardev_ops.owner)) {
777 		r = -ENODEV;
778 		goto out_err;
779 	}
780 
781 	return kvm;
782 
783 out_err:
784 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
785 	if (kvm->mmu_notifier.ops)
786 		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
787 #endif
788 out_err_no_mmu_notifier:
789 	hardware_disable_all();
790 out_err_no_disable:
791 	kvm_arch_destroy_vm(kvm);
792 out_err_no_arch_destroy_vm:
793 	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
794 	for (i = 0; i < KVM_NR_BUSES; i++)
795 		kfree(kvm_get_bus(kvm, i));
796 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
797 		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
798 	cleanup_srcu_struct(&kvm->irq_srcu);
799 out_err_no_irq_srcu:
800 	cleanup_srcu_struct(&kvm->srcu);
801 out_err_no_srcu:
802 	kvm_arch_free_vm(kvm);
803 	mmdrop(current->mm);
804 	return ERR_PTR(r);
805 }
806 
807 static void kvm_destroy_devices(struct kvm *kvm)
808 {
809 	struct kvm_device *dev, *tmp;
810 
811 	/*
812 	 * We do not need to take the kvm->lock here, because nobody else
813 	 * has a reference to the struct kvm at this point and therefore
814 	 * cannot access the devices list anyhow.
815 	 */
816 	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
817 		list_del(&dev->vm_node);
818 		dev->ops->destroy(dev);
819 	}
820 }
821 
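/*
 * Tear down a VM, roughly in the reverse order of kvm_create_vm(): remove
 * it from vm_list, destroy irq routing, I/O buses, shadow pages, arch
 * state, devices and memslots, then drop the mm and module references.
 */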
822 static void kvm_destroy_vm(struct kvm *kvm)
823 {
824 	int i;
825 	struct mm_struct *mm = kvm->mm;
826 
827 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
828 	kvm_destroy_vm_debugfs(kvm);
829 	kvm_arch_sync_events(kvm);
830 	mutex_lock(&kvm_lock);
831 	list_del(&kvm->vm_list);
832 	mutex_unlock(&kvm_lock);
833 	kvm_arch_pre_destroy_vm(kvm);
834 
835 	kvm_free_irq_routing(kvm);
836 	for (i = 0; i < KVM_NR_BUSES; i++) {
837 		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
838 
839 		if (bus)
840 			kvm_io_bus_destroy(bus);
841 		kvm->buses[i] = NULL;
842 	}
843 	kvm_coalesced_mmio_free(kvm);
844 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
845 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
846 #else
847 	kvm_arch_flush_shadow_all(kvm);
848 #endif
849 	kvm_arch_destroy_vm(kvm);
850 	kvm_destroy_devices(kvm);
851 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
852 		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
853 	cleanup_srcu_struct(&kvm->irq_srcu);
854 	cleanup_srcu_struct(&kvm->srcu);
855 	kvm_arch_free_vm(kvm);
856 	preempt_notifier_dec();
857 	hardware_disable_all();
858 	mmdrop(mm);
859 	module_put(kvm_chardev_ops.owner);
860 }
861 
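/* Grab a reference to a VM; paired with kvm_put_kvm(). */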
862 void kvm_get_kvm(struct kvm *kvm)
863 {
864 	refcount_inc(&kvm->users_count);
865 }
866 EXPORT_SYMBOL_GPL(kvm_get_kvm);
867 
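/* Drop a reference to a VM; the last put destroys it. */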
868 void kvm_put_kvm(struct kvm *kvm)
869 {
870 	if (refcount_dec_and_test(&kvm->users_count))
871 		kvm_destroy_vm(kvm);
872 }
873 EXPORT_SYMBOL_GPL(kvm_put_kvm);
874 
875 
876 static int kvm_vm_release(struct inode *inode, struct file *filp)
877 {
878 	struct kvm *kvm = filp->private_data;
879 
880 	kvm_irqfd_release(kvm);
881 
882 	kvm_put_kvm(kvm);
883 	return 0;
884 }
885 
886 /*
887  * Allocation size is twice as large as the actual dirty bitmap size.
888  * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
889  */
890 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
891 {
892 	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
893 
894 	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
895 	if (!memslot->dirty_bitmap)
896 		return -ENOMEM;
897 
898 	return 0;
899 }
900 
901 /*
902  * Insert memslot and re-sort memslots based on their GFN,
903  * so binary search could be used to lookup GFN.
904  * Sorting algorithm takes advantage of having initially
905  * sorted array and known changed memslot position.
906  */
907 static void update_memslots(struct kvm_memslots *slots,
908 			    struct kvm_memory_slot *new,
909 			    enum kvm_mr_change change)
910 {
911 	int id = new->id;
912 	int i = slots->id_to_index[id];
913 	struct kvm_memory_slot *mslots = slots->memslots;
914 
915 	WARN_ON(mslots[i].id != id);
916 	switch (change) {
917 	case KVM_MR_CREATE:
918 		slots->used_slots++;
919 		WARN_ON(mslots[i].npages || !new->npages);
920 		break;
921 	case KVM_MR_DELETE:
922 		slots->used_slots--;
923 		WARN_ON(new->npages || !mslots[i].npages);
924 		break;
925 	default:
926 		break;
927 	}
928 
929 	while (i < KVM_MEM_SLOTS_NUM - 1 &&
930 	       new->base_gfn <= mslots[i + 1].base_gfn) {
931 		if (!mslots[i + 1].npages)
932 			break;
933 		mslots[i] = mslots[i + 1];
934 		slots->id_to_index[mslots[i].id] = i;
935 		i++;
936 	}
937 
938 	/*
939 	 * The ">=" is needed when creating a slot with base_gfn == 0,
940 	 * so that it moves before all those with base_gfn == npages == 0.
941 	 *
942 	 * On the other hand, if new->npages is zero, the above loop has
943 	 * already left i pointing to the beginning of the empty part of
944 	 * mslots, and the ">=" would move the hole backwards in this
945 	 * case---which is wrong.  So skip the loop when deleting a slot.
946 	 */
947 	if (new->npages) {
948 		while (i > 0 &&
949 		       new->base_gfn >= mslots[i - 1].base_gfn) {
950 			mslots[i] = mslots[i - 1];
951 			slots->id_to_index[mslots[i].id] = i;
952 			i--;
953 		}
954 	} else
955 		WARN_ON_ONCE(i != slots->used_slots);
956 
957 	mslots[i] = *new;
958 	slots->id_to_index[mslots[i].id] = i;
959 }
960 
961 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
962 {
963 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
964 
965 #ifdef __KVM_HAVE_READONLY_MEM
966 	valid_flags |= KVM_MEM_READONLY;
967 #endif
968 
969 	if (mem->flags & ~valid_flags)
970 		return -EINVAL;
971 
972 	return 0;
973 }
974 
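/*
 * Publish @slots as the active memslot array for address space @as_id and
 * wait for all SRCU readers of the old array to drop out.  The memslot
 * generation is bumped (with the update-in-progress flag set during the
 * switch) so that cached gfn translations get invalidated.  Returns the
 * old array, which the caller is responsible for freeing.
 */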
975 static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
976 		int as_id, struct kvm_memslots *slots)
977 {
978 	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
979 	u64 gen = old_memslots->generation;
980 
981 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
982 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
983 
984 	rcu_assign_pointer(kvm->memslots[as_id], slots);
985 	synchronize_srcu_expedited(&kvm->srcu);
986 
987 	/*
988 	 * Increment the new memslot generation a second time, dropping the
989 	 * update in-progress flag and incrementing the generation based on
990 	 * the number of address spaces.  This provides a unique and easily
991 	 * identifiable generation number while the memslots are in flux.
992 	 */
993 	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
994 
995 	/*
996 	 * Generations must be unique even across address spaces.  We do not need
997 	 * a global counter for that, instead the generation space is evenly split
998 	 * across address spaces.  For example, with two address spaces, address
999 	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1000 	 * use generations 1, 3, 5, ...
1001 	 */
1002 	gen += KVM_ADDRESS_SPACE_NUM;
1003 
1004 	kvm_arch_memslots_updated(kvm, gen);
1005 
1006 	slots->generation = gen;
1007 
1008 	return old_memslots;
1009 }
1010 
1011 /*
1012  * Allocate some memory and give it an address in the guest physical address
1013  * space.
1014  *
1015  * Discontiguous memory is allowed, mostly for framebuffers.
1016  *
1017  * Must be called holding kvm->slots_lock for write.
1018  */
1019 int __kvm_set_memory_region(struct kvm *kvm,
1020 			    const struct kvm_userspace_memory_region *mem)
1021 {
1022 	int r;
1023 	gfn_t base_gfn;
1024 	unsigned long npages;
1025 	struct kvm_memory_slot *slot;
1026 	struct kvm_memory_slot old, new;
1027 	struct kvm_memslots *slots = NULL, *old_memslots;
1028 	int as_id, id;
1029 	enum kvm_mr_change change;
1030 
1031 	r = check_memory_region_flags(mem);
1032 	if (r)
1033 		goto out;
1034 
1035 	r = -EINVAL;
1036 	as_id = mem->slot >> 16;
1037 	id = (u16)mem->slot;
1038 
1039 	/* General sanity checks */
1040 	if (mem->memory_size & (PAGE_SIZE - 1))
1041 		goto out;
1042 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1043 		goto out;
1044 	/* We can read the guest memory with __xxx_user() later on. */
1045 	if ((id < KVM_USER_MEM_SLOTS) &&
1046 	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1047 	     (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1048 	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1049 			mem->memory_size)))
1050 		goto out;
1051 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1052 		goto out;
1053 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1054 		goto out;
1055 
1056 	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1057 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1058 	npages = mem->memory_size >> PAGE_SHIFT;
1059 
1060 	if (npages > KVM_MEM_MAX_NR_PAGES)
1061 		goto out;
1062 
1063 	new = old = *slot;
1064 
1065 	new.id = id;
1066 	new.base_gfn = base_gfn;
1067 	new.npages = npages;
1068 	new.flags = mem->flags;
1069 
1070 	if (npages) {
1071 		if (!old.npages)
1072 			change = KVM_MR_CREATE;
1073 		else { /* Modify an existing slot. */
1074 			if ((mem->userspace_addr != old.userspace_addr) ||
1075 			    (npages != old.npages) ||
1076 			    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1077 				goto out;
1078 
1079 			if (base_gfn != old.base_gfn)
1080 				change = KVM_MR_MOVE;
1081 			else if (new.flags != old.flags)
1082 				change = KVM_MR_FLAGS_ONLY;
1083 			else { /* Nothing to change. */
1084 				r = 0;
1085 				goto out;
1086 			}
1087 		}
1088 	} else {
1089 		if (!old.npages)
1090 			goto out;
1091 
1092 		change = KVM_MR_DELETE;
1093 		new.base_gfn = 0;
1094 		new.flags = 0;
1095 	}
1096 
1097 	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1098 		/* Check for overlaps */
1099 		r = -EEXIST;
1100 		kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
1101 			if (slot->id == id)
1102 				continue;
1103 			if (!((base_gfn + npages <= slot->base_gfn) ||
1104 			      (base_gfn >= slot->base_gfn + slot->npages)))
1105 				goto out;
1106 		}
1107 	}
1108 
1109 	/* Free page dirty bitmap if unneeded */
1110 	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1111 		new.dirty_bitmap = NULL;
1112 
1113 	r = -ENOMEM;
1114 	if (change == KVM_MR_CREATE) {
1115 		new.userspace_addr = mem->userspace_addr;
1116 
1117 		if (kvm_arch_create_memslot(kvm, &new, npages))
1118 			goto out_free;
1119 	}
1120 
1121 	/* Allocate page dirty bitmap if needed */
1122 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1123 		if (kvm_create_dirty_bitmap(&new) < 0)
1124 			goto out_free;
1125 	}
1126 
1127 	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
1128 	if (!slots)
1129 		goto out_free;
1130 	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
1131 
1132 	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
1133 		slot = id_to_memslot(slots, id);
1134 		slot->flags |= KVM_MEMSLOT_INVALID;
1135 
1136 		old_memslots = install_new_memslots(kvm, as_id, slots);
1137 
1138 		/* From this point no new shadow pages pointing to a deleted,
1139 		 * or moved, memslot will be created.
1140 		 *
1141 		 * validation of sp->gfn happens in:
1142 		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1143 		 *	- kvm_is_visible_gfn (mmu_check_roots)
1144 		 */
1145 		kvm_arch_flush_shadow_memslot(kvm, slot);
1146 
1147 		/*
1148 		 * We can re-use the old_memslots from above, the only difference
1149 		 * from the currently installed memslots is the invalid flag.  This
1150 		 * will get overwritten by update_memslots anyway.
1151 		 */
1152 		slots = old_memslots;
1153 	}
1154 
1155 	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
1156 	if (r)
1157 		goto out_slots;
1158 
1159 	/* actual memory is freed via old in kvm_free_memslot below */
1160 	if (change == KVM_MR_DELETE) {
1161 		new.dirty_bitmap = NULL;
1162 		memset(&new.arch, 0, sizeof(new.arch));
1163 	}
1164 
1165 	update_memslots(slots, &new, change);
1166 	old_memslots = install_new_memslots(kvm, as_id, slots);
1167 
1168 	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
1169 
1170 	kvm_free_memslot(kvm, &old, &new);
1171 	kvfree(old_memslots);
1172 	return 0;
1173 
1174 out_slots:
1175 	kvfree(slots);
1176 out_free:
1177 	kvm_free_memslot(kvm, &new, &old);
1178 out:
1179 	return r;
1180 }
1181 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1182 
1183 int kvm_set_memory_region(struct kvm *kvm,
1184 			  const struct kvm_userspace_memory_region *mem)
1185 {
1186 	int r;
1187 
1188 	mutex_lock(&kvm->slots_lock);
1189 	r = __kvm_set_memory_region(kvm, mem);
1190 	mutex_unlock(&kvm->slots_lock);
1191 	return r;
1192 }
1193 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1194 
1195 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1196 					  struct kvm_userspace_memory_region *mem)
1197 {
1198 	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1199 		return -EINVAL;
1200 
1201 	return kvm_set_memory_region(kvm, mem);
1202 }
1203 
1204 int kvm_get_dirty_log(struct kvm *kvm,
1205 			struct kvm_dirty_log *log, int *is_dirty)
1206 {
1207 	struct kvm_memslots *slots;
1208 	struct kvm_memory_slot *memslot;
1209 	int i, as_id, id;
1210 	unsigned long n;
1211 	unsigned long any = 0;
1212 
1213 	as_id = log->slot >> 16;
1214 	id = (u16)log->slot;
1215 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1216 		return -EINVAL;
1217 
1218 	slots = __kvm_memslots(kvm, as_id);
1219 	memslot = id_to_memslot(slots, id);
1220 	if (!memslot->dirty_bitmap)
1221 		return -ENOENT;
1222 
1223 	n = kvm_dirty_bitmap_bytes(memslot);
1224 
1225 	for (i = 0; !any && i < n/sizeof(long); ++i)
1226 		any = memslot->dirty_bitmap[i];
1227 
1228 	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1229 		return -EFAULT;
1230 
1231 	if (any)
1232 		*is_dirty = 1;
1233 	return 0;
1234 }
1235 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1236 
1237 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1238 /**
1239  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1240  *	and reenable dirty page tracking for the corresponding pages.
1241  * @kvm:	pointer to kvm instance
1242  * @log:	slot id and address to which we copy the log
1243  * @flush:	true if TLB flush is needed by caller
1244  *
1245  * We need to keep it in mind that VCPU threads can write to the bitmap
1246  * concurrently. So, to avoid losing track of dirty pages we keep the
1247  * following order:
1248  *
1249  *    1. Take a snapshot of the bit and clear it if needed.
1250  *    2. Write protect the corresponding page.
1251  *    3. Copy the snapshot to the userspace.
1252  *    4. Upon return caller flushes TLB's if needed.
1253  *
1254  * Between 2 and 4, the guest may write to the page using the remaining TLB
1255  * entry.  This is not a problem because the page is reported dirty using
1256  * the snapshot taken before and step 4 ensures that writes done after
1257  * exiting to userspace will be logged for the next call.
1258  *
1259  */
1260 int kvm_get_dirty_log_protect(struct kvm *kvm,
1261 			struct kvm_dirty_log *log, bool *flush)
1262 {
1263 	struct kvm_memslots *slots;
1264 	struct kvm_memory_slot *memslot;
1265 	int i, as_id, id;
1266 	unsigned long n;
1267 	unsigned long *dirty_bitmap;
1268 	unsigned long *dirty_bitmap_buffer;
1269 
1270 	as_id = log->slot >> 16;
1271 	id = (u16)log->slot;
1272 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1273 		return -EINVAL;
1274 
1275 	slots = __kvm_memslots(kvm, as_id);
1276 	memslot = id_to_memslot(slots, id);
1277 
1278 	dirty_bitmap = memslot->dirty_bitmap;
1279 	if (!dirty_bitmap)
1280 		return -ENOENT;
1281 
1282 	n = kvm_dirty_bitmap_bytes(memslot);
1283 	*flush = false;
1284 	if (kvm->manual_dirty_log_protect) {
1285 		/*
1286 		 * Unlike kvm_get_dirty_log, we always return false in *flush,
1287 		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
1288 		 * is some code duplication between this function and
1289 		 * kvm_get_dirty_log, but hopefully all architecture
1290 		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
1291 		 * can be eliminated.
1292 		 */
1293 		dirty_bitmap_buffer = dirty_bitmap;
1294 	} else {
1295 		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1296 		memset(dirty_bitmap_buffer, 0, n);
1297 
1298 		spin_lock(&kvm->mmu_lock);
1299 		for (i = 0; i < n / sizeof(long); i++) {
1300 			unsigned long mask;
1301 			gfn_t offset;
1302 
1303 			if (!dirty_bitmap[i])
1304 				continue;
1305 
1306 			*flush = true;
1307 			mask = xchg(&dirty_bitmap[i], 0);
1308 			dirty_bitmap_buffer[i] = mask;
1309 
1310 			offset = i * BITS_PER_LONG;
1311 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1312 								offset, mask);
1313 		}
1314 		spin_unlock(&kvm->mmu_lock);
1315 	}
1316 
1317 	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1318 		return -EFAULT;
1319 	return 0;
1320 }
1321 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1322 
1323 /**
1324  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
1325  *	and reenable dirty page tracking for the corresponding pages.
1326  * @kvm:	pointer to kvm instance
1327  * @log:	slot id and address from which to fetch the bitmap of dirty pages
1328  * @flush:	true if TLB flush is needed by caller
1329  */
1330 int kvm_clear_dirty_log_protect(struct kvm *kvm,
1331 				struct kvm_clear_dirty_log *log, bool *flush)
1332 {
1333 	struct kvm_memslots *slots;
1334 	struct kvm_memory_slot *memslot;
1335 	int as_id, id;
1336 	gfn_t offset;
1337 	unsigned long i, n;
1338 	unsigned long *dirty_bitmap;
1339 	unsigned long *dirty_bitmap_buffer;
1340 
1341 	as_id = log->slot >> 16;
1342 	id = (u16)log->slot;
1343 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1344 		return -EINVAL;
1345 
1346 	if (log->first_page & 63)
1347 		return -EINVAL;
1348 
1349 	slots = __kvm_memslots(kvm, as_id);
1350 	memslot = id_to_memslot(slots, id);
1351 
1352 	dirty_bitmap = memslot->dirty_bitmap;
1353 	if (!dirty_bitmap)
1354 		return -ENOENT;
1355 
1356 	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
1357 
1358 	if (log->first_page > memslot->npages ||
1359 	    log->num_pages > memslot->npages - log->first_page ||
1360 	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
1361 	    return -EINVAL;
1362 
1363 	*flush = false;
1364 	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1365 	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
1366 		return -EFAULT;
1367 
1368 	spin_lock(&kvm->mmu_lock);
1369 	for (offset = log->first_page, i = offset / BITS_PER_LONG,
1370 		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
1371 	     i++, offset += BITS_PER_LONG) {
1372 		unsigned long mask = *dirty_bitmap_buffer++;
1373 		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
1374 		if (!mask)
1375 			continue;
1376 
1377 		mask &= atomic_long_fetch_andnot(mask, p);
1378 
1379 		/*
1380 		 * mask contains the bits that really have been cleared.  This
1381 		 * never includes any bits beyond the length of the memslot (if
1382 		 * the length is not aligned to 64 pages), therefore it is not
1383 		 * a problem if userspace sets them in log->dirty_bitmap.
1384 		*/
1385 		if (mask) {
1386 			*flush = true;
1387 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1388 								offset, mask);
1389 		}
1390 	}
1391 	spin_unlock(&kvm->mmu_lock);
1392 
1393 	return 0;
1394 }
1395 EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
1396 #endif
1397 
1398 bool kvm_largepages_enabled(void)
1399 {
1400 	return largepages_enabled;
1401 }
1402 
1403 void kvm_disable_largepages(void)
1404 {
1405 	largepages_enabled = false;
1406 }
1407 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
1408 
1409 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1410 {
1411 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
1412 }
1413 EXPORT_SYMBOL_GPL(gfn_to_memslot);
1414 
1415 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
1416 {
1417 	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
1418 }
1419 
1420 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
1421 {
1422 	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
1423 
1424 	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
1425 	      memslot->flags & KVM_MEMSLOT_INVALID)
1426 		return false;
1427 
1428 	return true;
1429 }
1430 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1431 
1432 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
1433 {
1434 	struct vm_area_struct *vma;
1435 	unsigned long addr, size;
1436 
1437 	size = PAGE_SIZE;
1438 
1439 	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
1440 	if (kvm_is_error_hva(addr))
1441 		return PAGE_SIZE;
1442 
1443 	down_read(&current->mm->mmap_sem);
1444 	vma = find_vma(current->mm, addr);
1445 	if (!vma)
1446 		goto out;
1447 
1448 	size = vma_kernel_pagesize(vma);
1449 
1450 out:
1451 	up_read(&current->mm->mmap_sem);
1452 
1453 	return size;
1454 }
1455 
1456 static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1457 {
1458 	return slot->flags & KVM_MEM_READONLY;
1459 }
1460 
1461 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1462 				       gfn_t *nr_pages, bool write)
1463 {
1464 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1465 		return KVM_HVA_ERR_BAD;
1466 
1467 	if (memslot_is_readonly(slot) && write)
1468 		return KVM_HVA_ERR_RO_BAD;
1469 
1470 	if (nr_pages)
1471 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
1472 
1473 	return __gfn_to_hva_memslot(slot, gfn);
1474 }
1475 
1476 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1477 				     gfn_t *nr_pages)
1478 {
1479 	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1480 }
1481 
1482 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1483 					gfn_t gfn)
1484 {
1485 	return gfn_to_hva_many(slot, gfn, NULL);
1486 }
1487 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1488 
1489 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1490 {
1491 	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1492 }
1493 EXPORT_SYMBOL_GPL(gfn_to_hva);
1494 
1495 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
1496 {
1497 	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
1498 }
1499 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
1500 
1501 /*
1502  * Return the hva of a @gfn and the R/W attribute if possible.
1503  *
1504  * @slot: the kvm_memory_slot which contains @gfn
1505  * @gfn: the gfn to be translated
1506  * @writable: used to return the read/write attribute of the @slot if the hva
1507  * is valid and @writable is not NULL
1508  */
1509 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
1510 				      gfn_t gfn, bool *writable)
1511 {
1512 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
1513 
1514 	if (!kvm_is_error_hva(hva) && writable)
1515 		*writable = !memslot_is_readonly(slot);
1516 
1517 	return hva;
1518 }
1519 
1520 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1521 {
1522 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1523 
1524 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
1525 }
1526 
1527 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
1528 {
1529 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1530 
1531 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
1532 }
1533 
1534 static inline int check_user_page_hwpoison(unsigned long addr)
1535 {
1536 	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
1537 
1538 	rc = get_user_pages(addr, 1, flags, NULL, NULL);
1539 	return rc == -EHWPOISON;
1540 }
1541 
1542 /*
1543  * The fast path to get the writable pfn which will be stored in @pfn,
1544  * true indicates success, otherwise false is returned.  It's also the
1545  * only part that runs if we are in atomic context.
1546  */
1547 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
1548 			    bool *writable, kvm_pfn_t *pfn)
1549 {
1550 	struct page *page[1];
1551 	int npages;
1552 
1553 	/*
1554 	 * Fast pin a writable pfn only if it is a write fault request
1555 	 * or the caller allows to map a writable pfn for a read fault
1556 	 * request.
1557 	 */
1558 	if (!(write_fault || writable))
1559 		return false;
1560 
1561 	npages = __get_user_pages_fast(addr, 1, 1, page);
1562 	if (npages == 1) {
1563 		*pfn = page_to_pfn(page[0]);
1564 
1565 		if (writable)
1566 			*writable = true;
1567 		return true;
1568 	}
1569 
1570 	return false;
1571 }
1572 
1573 /*
1574  * The slow path to get the pfn of the specified host virtual address,
1575  * 1 indicates success, -errno is returned if error is detected.
1576  */
1577 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1578 			   bool *writable, kvm_pfn_t *pfn)
1579 {
1580 	unsigned int flags = FOLL_HWPOISON;
1581 	struct page *page;
1582 	int npages = 0;
1583 
1584 	might_sleep();
1585 
1586 	if (writable)
1587 		*writable = write_fault;
1588 
1589 	if (write_fault)
1590 		flags |= FOLL_WRITE;
1591 	if (async)
1592 		flags |= FOLL_NOWAIT;
1593 
1594 	npages = get_user_pages_unlocked(addr, 1, &page, flags);
1595 	if (npages != 1)
1596 		return npages;
1597 
1598 	/* map read fault as writable if possible */
1599 	if (unlikely(!write_fault) && writable) {
1600 		struct page *wpage;
1601 
1602 		if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
1603 			*writable = true;
1604 			put_page(page);
1605 			page = wpage;
1606 		}
1607 	}
1608 	*pfn = page_to_pfn(page);
1609 	return npages;
1610 }
1611 
1612 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1613 {
1614 	if (unlikely(!(vma->vm_flags & VM_READ)))
1615 		return false;
1616 
1617 	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1618 		return false;
1619 
1620 	return true;
1621 }
1622 
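/*
 * Try to take a reference on the page backing @pfn.  Reserved pfns need no
 * reference; for everything else, fail (return 0) if the refcount has
 * already dropped to zero.
 */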
1623 static int kvm_try_get_pfn(kvm_pfn_t pfn)
1624 {
1625 	if (kvm_is_reserved_pfn(pfn))
1626 		return 1;
1627 	return get_page_unless_zero(pfn_to_page(pfn));
1628 }
1629 
1630 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
1631 			       unsigned long addr, bool *async,
1632 			       bool write_fault, bool *writable,
1633 			       kvm_pfn_t *p_pfn)
1634 {
1635 	kvm_pfn_t pfn;
1636 	pte_t *ptep;
1637 	spinlock_t *ptl;
1638 	int r;
1639 
1640 	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
1641 	if (r) {
1642 		/*
1643 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
1644 		 * not call the fault handler, so do it here.
1645 		 */
1646 		bool unlocked = false;
1647 		r = fixup_user_fault(current, current->mm, addr,
1648 				     (write_fault ? FAULT_FLAG_WRITE : 0),
1649 				     &unlocked);
1650 		if (unlocked)
1651 			return -EAGAIN;
1652 		if (r)
1653 			return r;
1654 
1655 		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
1656 		if (r)
1657 			return r;
1658 	}
1659 
1660 	if (write_fault && !pte_write(*ptep)) {
1661 		pfn = KVM_PFN_ERR_RO_FAULT;
1662 		goto out;
1663 	}
1664 
1665 	if (writable)
1666 		*writable = pte_write(*ptep);
1667 	pfn = pte_pfn(*ptep);
1668 
1669 	/*
1670 	 * Get a reference here because callers of *hva_to_pfn* and
1671 	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
1672 	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
1673 	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
1674 	 * simply do nothing for reserved pfns.
1675 	 *
1676 	 * Whoever called remap_pfn_range is also going to call e.g.
1677 	 * unmap_mapping_range before the underlying pages are freed,
1678 	 * causing a call to our MMU notifier.
1679 	 *
1680 	 * Certain IO or PFNMAP mappings can be backed with valid
1681 	 * struct pages, but be allocated without refcounting e.g.,
1682 	 * tail pages of non-compound higher order allocations, which
1683 	 * would then underflow the refcount when the caller does the
1684 	 * required put_page. Don't allow those pages here.
1685 	 */
1686 	if (!kvm_try_get_pfn(pfn))
1687 		r = -EFAULT;
1688 
1689 out:
1690 	pte_unmap_unlock(ptep, ptl);
1691 	*p_pfn = pfn;
1692 
1693 	return r;
1694 }
1695 
1696 /*
1697  * Pin guest page in memory and return its pfn.
1698  * @addr: host virtual address which maps memory to the guest
1699  * @atomic: whether this function is called from atomic context and must not sleep
1700  * @async: whether this function needs to wait for IO to complete if the
1701  *         host page is not in memory
1702  * @write_fault: whether we should get a writable host page
1703  * @writable: whether to allow mapping a writable host page for !@write_fault
1704  *
1705  * The function will map a writable host page for these two cases:
1706  * 1): @write_fault = true
1707  * 2): @write_fault = false && @writable, @writable will tell the caller
1708  *     whether the mapping is writable.
1709  */
1710 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1711 			bool write_fault, bool *writable)
1712 {
1713 	struct vm_area_struct *vma;
1714 	kvm_pfn_t pfn = 0;
1715 	int npages, r;
1716 
1717 	/* we can do it either atomically or asynchronously, not both */
1718 	BUG_ON(atomic && async);
1719 
1720 	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
1721 		return pfn;
1722 
1723 	if (atomic)
1724 		return KVM_PFN_ERR_FAULT;
1725 
1726 	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1727 	if (npages == 1)
1728 		return pfn;
1729 
1730 	down_read(&current->mm->mmap_sem);
1731 	if (npages == -EHWPOISON ||
1732 	      (!async && check_user_page_hwpoison(addr))) {
1733 		pfn = KVM_PFN_ERR_HWPOISON;
1734 		goto exit;
1735 	}
1736 
1737 retry:
1738 	vma = find_vma_intersection(current->mm, addr, addr + 1);
1739 
1740 	if (vma == NULL)
1741 		pfn = KVM_PFN_ERR_FAULT;
1742 	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
1743 		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
1744 		if (r == -EAGAIN)
1745 			goto retry;
1746 		if (r < 0)
1747 			pfn = KVM_PFN_ERR_FAULT;
1748 	} else {
1749 		if (async && vma_is_valid(vma, write_fault))
1750 			*async = true;
1751 		pfn = KVM_PFN_ERR_FAULT;
1752 	}
1753 exit:
1754 	up_read(&current->mm->mmap_sem);
1755 	return pfn;
1756 }
1757 
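/*
 * A minimal sketch of the @write_fault / @writable contract documented
 * above.  The caller and its name are hypothetical; only the two
 * documented cases are shown.
 */
static kvm_pfn_t example_hva_to_pfn_caller(unsigned long hva, bool need_write)
{
        bool writable;

        if (need_write)
                /* Case 1: insist on a writable mapping. */
                return hva_to_pfn(hva, false, NULL, true, NULL);

        /*
         * Case 2: read access suffices; @writable reports whether the
         * mapping turned out to be writable anyway.
         */
        return hva_to_pfn(hva, false, NULL, false, &writable);
}
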
1758 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
1759 			       bool atomic, bool *async, bool write_fault,
1760 			       bool *writable)
1761 {
1762 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1763 
1764 	if (addr == KVM_HVA_ERR_RO_BAD) {
1765 		if (writable)
1766 			*writable = false;
1767 		return KVM_PFN_ERR_RO_FAULT;
1768 	}
1769 
1770 	if (kvm_is_error_hva(addr)) {
1771 		if (writable)
1772 			*writable = false;
1773 		return KVM_PFN_NOSLOT;
1774 	}
1775 
1776 	/* Do not map a writable pfn in the readonly memslot. */
1777 	if (writable && memslot_is_readonly(slot)) {
1778 		*writable = false;
1779 		writable = NULL;
1780 	}
1781 
1782 	return hva_to_pfn(addr, atomic, async, write_fault,
1783 			  writable);
1784 }
1785 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
1786 
1787 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1788 		      bool *writable)
1789 {
1790 	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
1791 				    write_fault, writable);
1792 }
1793 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1794 
1795 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1796 {
1797 	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1798 }
1799 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
1800 
1801 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1802 {
1803 	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1804 }
1805 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1806 
1807 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1808 {
1809 	return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1810 }
1811 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1812 
1813 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
1814 {
1815 	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1816 }
1817 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
1818 
1819 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1820 {
1821 	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
1822 }
1823 EXPORT_SYMBOL_GPL(gfn_to_pfn);
1824 
1825 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1826 {
1827 	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1828 }
1829 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
1830 
1831 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
1832 			    struct page **pages, int nr_pages)
1833 {
1834 	unsigned long addr;
1835 	gfn_t entry = 0;
1836 
1837 	addr = gfn_to_hva_many(slot, gfn, &entry);
1838 	if (kvm_is_error_hva(addr))
1839 		return -1;
1840 
1841 	if (entry < nr_pages)
1842 		return 0;
1843 
1844 	return __get_user_pages_fast(addr, nr_pages, 1, pages);
1845 }
1846 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1847 
1848 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
1849 {
1850 	if (is_error_noslot_pfn(pfn))
1851 		return KVM_ERR_PTR_BAD_PAGE;
1852 
1853 	if (kvm_is_reserved_pfn(pfn)) {
1854 		WARN_ON(1);
1855 		return KVM_ERR_PTR_BAD_PAGE;
1856 	}
1857 
1858 	return pfn_to_page(pfn);
1859 }
1860 
1861 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1862 {
1863 	kvm_pfn_t pfn;
1864 
1865 	pfn = gfn_to_pfn(kvm, gfn);
1866 
1867 	return kvm_pfn_to_page(pfn);
1868 }
1869 EXPORT_SYMBOL_GPL(gfn_to_page);
1870 
1871 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
1872 {
1873 	if (pfn == 0)
1874 		return;
1875 
1876 	if (cache)
1877 		cache->pfn = cache->gfn = 0;
1878 
1879 	if (dirty)
1880 		kvm_release_pfn_dirty(pfn);
1881 	else
1882 		kvm_release_pfn_clean(pfn);
1883 }
1884 
1885 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
1886 				 struct gfn_to_pfn_cache *cache, u64 gen)
1887 {
1888 	kvm_release_pfn(cache->pfn, cache->dirty, cache);
1889 
1890 	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
1891 	cache->gfn = gfn;
1892 	cache->dirty = false;
1893 	cache->generation = gen;
1894 }
1895 
1896 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
1897 			 struct kvm_host_map *map,
1898 			 struct gfn_to_pfn_cache *cache,
1899 			 bool atomic)
1900 {
1901 	kvm_pfn_t pfn;
1902 	void *hva = NULL;
1903 	struct page *page = KVM_UNMAPPED_PAGE;
1904 	struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
1905 	u64 gen = slots->generation;
1906 
1907 	if (!map)
1908 		return -EINVAL;
1909 
1910 	if (cache) {
1911 		if (!cache->pfn || cache->gfn != gfn ||
1912 			cache->generation != gen) {
1913 			if (atomic)
1914 				return -EAGAIN;
1915 			kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
1916 		}
1917 		pfn = cache->pfn;
1918 	} else {
1919 		if (atomic)
1920 			return -EAGAIN;
1921 		pfn = gfn_to_pfn_memslot(slot, gfn);
1922 	}
1923 	if (is_error_noslot_pfn(pfn))
1924 		return -EINVAL;
1925 
1926 	if (pfn_valid(pfn)) {
1927 		page = pfn_to_page(pfn);
1928 		if (atomic)
1929 			hva = kmap_atomic(page);
1930 		else
1931 			hva = kmap(page);
1932 #ifdef CONFIG_HAS_IOMEM
1933 	} else if (!atomic) {
1934 		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
1935 	} else {
1936 		return -EINVAL;
1937 #endif
1938 	}
1939 
1940 	if (!hva)
1941 		return -EFAULT;
1942 
1943 	map->page = page;
1944 	map->hva = hva;
1945 	map->pfn = pfn;
1946 	map->gfn = gfn;
1947 
1948 	return 0;
1949 }
1950 
1951 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
1952 		struct gfn_to_pfn_cache *cache, bool atomic)
1953 {
1954 	return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
1955 			cache, atomic);
1956 }
1957 EXPORT_SYMBOL_GPL(kvm_map_gfn);
1958 
1959 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
1960 {
1961 	return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
1962 		NULL, false);
1963 }
1964 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
1965 
1966 static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
1967 			struct kvm_host_map *map,
1968 			struct gfn_to_pfn_cache *cache,
1969 			bool dirty, bool atomic)
1970 {
1971 	if (!map)
1972 		return;
1973 
1974 	if (!map->hva)
1975 		return;
1976 
1977 	if (map->page != KVM_UNMAPPED_PAGE) {
1978 		if (atomic)
1979 			kunmap_atomic(map->hva);
1980 		else
1981 			kunmap(map->page);
1982 	}
1983 #ifdef CONFIG_HAS_IOMEM
1984 	else if (!atomic)
1985 		memunmap(map->hva);
1986 	else
1987 		WARN_ONCE(1, "Unexpected unmapping in atomic context");
1988 #endif
1989 
1990 	if (dirty)
1991 		mark_page_dirty_in_slot(memslot, map->gfn);
1992 
1993 	if (cache)
1994 		cache->dirty |= dirty;
1995 	else
1996 		kvm_release_pfn(map->pfn, dirty, NULL);
1997 
1998 	map->hva = NULL;
1999 	map->page = NULL;
2000 }
2001 
2002 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
2003 		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
2004 {
2005 	__kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
2006 			cache, dirty, atomic);
2007 	return 0;
2008 }
2009 EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2010 
2011 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2012 {
2013 	__kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
2014 			dirty, false);
2015 }
2016 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2017 
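/*
 * A minimal sketch pairing kvm_vcpu_map() with kvm_vcpu_unmap(), assuming
 * a valid vcpu from arch code.  The helper name, the gfn and the byte
 * written are illustration values only.
 */
static int example_poke_guest_byte(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        struct kvm_host_map map;
        int r;

        r = kvm_vcpu_map(vcpu, gfn, &map);
        if (r)
                return r;

        /* map.hva is a kernel-usable alias of the whole guest frame. */
        *(u8 *)map.hva = 0xff;

        /* dirty == true also marks the gfn in the slot's dirty bitmap. */
        kvm_vcpu_unmap(vcpu, &map, true);
        return 0;
}
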
2018 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2019 {
2020 	kvm_pfn_t pfn;
2021 
2022 	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2023 
2024 	return kvm_pfn_to_page(pfn);
2025 }
2026 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2027 
2028 void kvm_release_page_clean(struct page *page)
2029 {
2030 	WARN_ON(is_error_page(page));
2031 
2032 	kvm_release_pfn_clean(page_to_pfn(page));
2033 }
2034 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2035 
2036 void kvm_release_pfn_clean(kvm_pfn_t pfn)
2037 {
2038 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2039 		put_page(pfn_to_page(pfn));
2040 }
2041 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2042 
2043 void kvm_release_page_dirty(struct page *page)
2044 {
2045 	WARN_ON(is_error_page(page));
2046 
2047 	kvm_release_pfn_dirty(page_to_pfn(page));
2048 }
2049 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2050 
2051 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2052 {
2053 	kvm_set_pfn_dirty(pfn);
2054 	kvm_release_pfn_clean(pfn);
2055 }
2056 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2057 
2058 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2059 {
2060 	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
2061 		struct page *page = pfn_to_page(pfn);
2062 
2063 		SetPageDirty(page);
2064 	}
2065 }
2066 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2067 
2068 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2069 {
2070 	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2071 		mark_page_accessed(pfn_to_page(pfn));
2072 }
2073 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2074 
2075 void kvm_get_pfn(kvm_pfn_t pfn)
2076 {
2077 	if (!kvm_is_reserved_pfn(pfn))
2078 		get_page(pfn_to_page(pfn));
2079 }
2080 EXPORT_SYMBOL_GPL(kvm_get_pfn);
2081 
2082 static int next_segment(unsigned long len, int offset)
2083 {
2084 	if (len > PAGE_SIZE - offset)
2085 		return PAGE_SIZE - offset;
2086 	else
2087 		return len;
2088 }
2089 
2090 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2091 				 void *data, int offset, int len)
2092 {
2093 	int r;
2094 	unsigned long addr;
2095 
2096 	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2097 	if (kvm_is_error_hva(addr))
2098 		return -EFAULT;
2099 	r = __copy_from_user(data, (void __user *)addr + offset, len);
2100 	if (r)
2101 		return -EFAULT;
2102 	return 0;
2103 }
2104 
2105 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2106 			int len)
2107 {
2108 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2109 
2110 	return __kvm_read_guest_page(slot, gfn, data, offset, len);
2111 }
2112 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2113 
2114 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2115 			     int offset, int len)
2116 {
2117 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2118 
2119 	return __kvm_read_guest_page(slot, gfn, data, offset, len);
2120 }
2121 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2122 
2123 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2124 {
2125 	gfn_t gfn = gpa >> PAGE_SHIFT;
2126 	int seg;
2127 	int offset = offset_in_page(gpa);
2128 	int ret;
2129 
2130 	while ((seg = next_segment(len, offset)) != 0) {
2131 		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2132 		if (ret < 0)
2133 			return ret;
2134 		offset = 0;
2135 		len -= seg;
2136 		data += seg;
2137 		++gfn;
2138 	}
2139 	return 0;
2140 }
2141 EXPORT_SYMBOL_GPL(kvm_read_guest);
2142 
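/*
 * A worked example of the segmentation above, assuming 4 KiB pages:
 * kvm_read_guest(kvm, 0x1ff8, buf, 16) starts at offset 0xff8 within gfn 1,
 * so next_segment() first yields PAGE_SIZE - 0xff8 = 8 bytes, and the
 * remaining 8 bytes are then read from offset 0 of gfn 2.
 */
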
2143 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2144 {
2145 	gfn_t gfn = gpa >> PAGE_SHIFT;
2146 	int seg;
2147 	int offset = offset_in_page(gpa);
2148 	int ret;
2149 
2150 	while ((seg = next_segment(len, offset)) != 0) {
2151 		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2152 		if (ret < 0)
2153 			return ret;
2154 		offset = 0;
2155 		len -= seg;
2156 		data += seg;
2157 		++gfn;
2158 	}
2159 	return 0;
2160 }
2161 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2162 
2163 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2164 			           void *data, int offset, unsigned long len)
2165 {
2166 	int r;
2167 	unsigned long addr;
2168 
2169 	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2170 	if (kvm_is_error_hva(addr))
2171 		return -EFAULT;
2172 	pagefault_disable();
2173 	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2174 	pagefault_enable();
2175 	if (r)
2176 		return -EFAULT;
2177 	return 0;
2178 }
2179 
2180 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
2181 			  unsigned long len)
2182 {
2183 	gfn_t gfn = gpa >> PAGE_SHIFT;
2184 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2185 	int offset = offset_in_page(gpa);
2186 
2187 	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2188 }
2189 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
2190 
2191 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2192 			       void *data, unsigned long len)
2193 {
2194 	gfn_t gfn = gpa >> PAGE_SHIFT;
2195 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2196 	int offset = offset_in_page(gpa);
2197 
2198 	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2199 }
2200 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2201 
2202 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
2203 			          const void *data, int offset, int len)
2204 {
2205 	int r;
2206 	unsigned long addr;
2207 
2208 	addr = gfn_to_hva_memslot(memslot, gfn);
2209 	if (kvm_is_error_hva(addr))
2210 		return -EFAULT;
2211 	r = __copy_to_user((void __user *)addr + offset, data, len);
2212 	if (r)
2213 		return -EFAULT;
2214 	mark_page_dirty_in_slot(memslot, gfn);
2215 	return 0;
2216 }
2217 
2218 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2219 			 const void *data, int offset, int len)
2220 {
2221 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2222 
2223 	return __kvm_write_guest_page(slot, gfn, data, offset, len);
2224 }
2225 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2226 
2227 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2228 			      const void *data, int offset, int len)
2229 {
2230 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2231 
2232 	return __kvm_write_guest_page(slot, gfn, data, offset, len);
2233 }
2234 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2235 
2236 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2237 		    unsigned long len)
2238 {
2239 	gfn_t gfn = gpa >> PAGE_SHIFT;
2240 	int seg;
2241 	int offset = offset_in_page(gpa);
2242 	int ret;
2243 
2244 	while ((seg = next_segment(len, offset)) != 0) {
2245 		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2246 		if (ret < 0)
2247 			return ret;
2248 		offset = 0;
2249 		len -= seg;
2250 		data += seg;
2251 		++gfn;
2252 	}
2253 	return 0;
2254 }
2255 EXPORT_SYMBOL_GPL(kvm_write_guest);
2256 
2257 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2258 		         unsigned long len)
2259 {
2260 	gfn_t gfn = gpa >> PAGE_SHIFT;
2261 	int seg;
2262 	int offset = offset_in_page(gpa);
2263 	int ret;
2264 
2265 	while ((seg = next_segment(len, offset)) != 0) {
2266 		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2267 		if (ret < 0)
2268 			return ret;
2269 		offset = 0;
2270 		len -= seg;
2271 		data += seg;
2272 		++gfn;
2273 	}
2274 	return 0;
2275 }
2276 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2277 
2278 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2279 				       struct gfn_to_hva_cache *ghc,
2280 				       gpa_t gpa, unsigned long len)
2281 {
2282 	int offset = offset_in_page(gpa);
2283 	gfn_t start_gfn = gpa >> PAGE_SHIFT;
2284 	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2285 	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2286 	gfn_t nr_pages_avail;
2287 	int r = start_gfn <= end_gfn ? 0 : -EINVAL;
2288 
2289 	ghc->gpa = gpa;
2290 	ghc->generation = slots->generation;
2291 	ghc->len = len;
2292 	ghc->hva = KVM_HVA_ERR_BAD;
2293 
2294 	/*
2295 	 * If the requested region crosses two memslots, we still
2296 	 * verify that the entire region is valid here.
2297 	 */
2298 	while (!r && start_gfn <= end_gfn) {
2299 		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2300 		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2301 					   &nr_pages_avail);
2302 		if (kvm_is_error_hva(ghc->hva))
2303 			r = -EFAULT;
2304 		start_gfn += nr_pages_avail;
2305 	}
2306 
2307 	/* Use the slow path for cross page reads and writes. */
2308 	if (!r && nr_pages_needed == 1)
2309 		ghc->hva += offset;
2310 	else
2311 		ghc->memslot = NULL;
2312 
2313 	return r;
2314 }
2315 
2316 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2317 			      gpa_t gpa, unsigned long len)
2318 {
2319 	struct kvm_memslots *slots = kvm_memslots(kvm);
2320 	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2321 }
2322 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2323 
2324 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2325 				  void *data, unsigned int offset,
2326 				  unsigned long len)
2327 {
2328 	struct kvm_memslots *slots = kvm_memslots(kvm);
2329 	int r;
2330 	gpa_t gpa = ghc->gpa + offset;
2331 
2332 	BUG_ON(len + offset > ghc->len);
2333 
2334 	if (slots->generation != ghc->generation)
2335 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2336 
2337 	if (kvm_is_error_hva(ghc->hva))
2338 		return -EFAULT;
2339 
2340 	if (unlikely(!ghc->memslot))
2341 		return kvm_write_guest(kvm, gpa, data, len);
2342 
2343 	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2344 	if (r)
2345 		return -EFAULT;
2346 	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
2347 
2348 	return 0;
2349 }
2350 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2351 
2352 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2353 			   void *data, unsigned long len)
2354 {
2355 	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2356 }
2357 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
2358 
2359 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2360 			   void *data, unsigned long len)
2361 {
2362 	struct kvm_memslots *slots = kvm_memslots(kvm);
2363 	int r;
2364 
2365 	BUG_ON(len > ghc->len);
2366 
2367 	if (slots->generation != ghc->generation)
2368 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2369 
2370 	if (kvm_is_error_hva(ghc->hva))
2371 		return -EFAULT;
2372 
2373 	if (unlikely(!ghc->memslot))
2374 		return kvm_read_guest(kvm, ghc->gpa, data, len);
2375 
2376 	r = __copy_from_user(data, (void __user *)ghc->hva, len);
2377 	if (r)
2378 		return -EFAULT;
2379 
2380 	return 0;
2381 }
2382 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
2383 
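/*
 * A minimal sketch of the gfn_to_hva_cache API above, assuming kernel code
 * that publishes a small value at a fixed guest physical address.  The
 * helper name and the value are illustrative only.
 */
static int example_publish_u64(struct kvm *kvm, gpa_t gpa, u64 seq)
{
        struct gfn_to_hva_cache ghc;
        u64 val = seq;
        int r;

        r = kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val));
        if (r)
                return r;

        /* Subsequent cached writes skip the gfn_to_memslot() lookup. */
        return kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
}
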
2384 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
2385 {
2386 	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
2387 
2388 	return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
2389 }
2390 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
2391 
2392 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
2393 {
2394 	gfn_t gfn = gpa >> PAGE_SHIFT;
2395 	int seg;
2396 	int offset = offset_in_page(gpa);
2397 	int ret;
2398 
2399 	while ((seg = next_segment(len, offset)) != 0) {
2400 		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
2401 		if (ret < 0)
2402 			return ret;
2403 		offset = 0;
2404 		len -= seg;
2405 		++gfn;
2406 	}
2407 	return 0;
2408 }
2409 EXPORT_SYMBOL_GPL(kvm_clear_guest);
2410 
2411 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
2412 				    gfn_t gfn)
2413 {
2414 	if (memslot && memslot->dirty_bitmap) {
2415 		unsigned long rel_gfn = gfn - memslot->base_gfn;
2416 
2417 		set_bit_le(rel_gfn, memslot->dirty_bitmap);
2418 	}
2419 }
2420 
2421 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
2422 {
2423 	struct kvm_memory_slot *memslot;
2424 
2425 	memslot = gfn_to_memslot(kvm, gfn);
2426 	mark_page_dirty_in_slot(memslot, gfn);
2427 }
2428 EXPORT_SYMBOL_GPL(mark_page_dirty);
2429 
2430 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
2431 {
2432 	struct kvm_memory_slot *memslot;
2433 
2434 	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2435 	mark_page_dirty_in_slot(memslot, gfn);
2436 }
2437 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
2438 
2439 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
2440 {
2441 	if (!vcpu->sigset_active)
2442 		return;
2443 
2444 	/*
2445 	 * This does a lockless modification of ->real_blocked, which is fine
2446 	 * because only current can change ->real_blocked and all readers of
2447 	 * ->real_blocked don't care as long as ->real_blocked is always a subset
2448 	 * of ->blocked.
2449 	 */
2450 	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
2451 }
2452 
2453 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
2454 {
2455 	if (!vcpu->sigset_active)
2456 		return;
2457 
2458 	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
2459 	sigemptyset(&current->real_blocked);
2460 }
2461 
2462 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
2463 {
2464 	unsigned int old, val, grow, grow_start;
2465 
2466 	old = val = vcpu->halt_poll_ns;
2467 	grow_start = READ_ONCE(halt_poll_ns_grow_start);
2468 	grow = READ_ONCE(halt_poll_ns_grow);
2469 	if (!grow)
2470 		goto out;
2471 
2472 	val *= grow;
2473 	if (val < grow_start)
2474 		val = grow_start;
2475 
2476 	if (val > halt_poll_ns)
2477 		val = halt_poll_ns;
2478 
2479 	vcpu->halt_poll_ns = val;
2480 out:
2481 	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
2482 }
2483 
2484 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
2485 {
2486 	unsigned int old, val, shrink, grow_start;
2487 
2488 	old = val = vcpu->halt_poll_ns;
2489 	shrink = READ_ONCE(halt_poll_ns_shrink);
2490 	grow_start = READ_ONCE(halt_poll_ns_grow_start);
2491 	if (shrink == 0)
2492 		val = 0;
2493 	else
2494 		val /= shrink;
2495 
2496 	if (val < grow_start)
2497 		val = 0;
2498 
2499 	vcpu->halt_poll_ns = val;
2500 	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
2501 }
2502 
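/*
 * A worked trace of the two helpers above, using the module parameter
 * defaults declared earlier in this file (halt_poll_ns_grow == 2,
 * halt_poll_ns_grow_start == 10000, halt_poll_ns_shrink == 0): successive
 * grows take vcpu->halt_poll_ns from 0 -> 10000 -> 20000 -> 40000 -> ...,
 * capped at the global halt_poll_ns, while a shrink with shrink == 0
 * resets it straight back to 0.
 */
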
2503 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
2504 {
2505 	int ret = -EINTR;
2506 	int idx = srcu_read_lock(&vcpu->kvm->srcu);
2507 
2508 	if (kvm_arch_vcpu_runnable(vcpu)) {
2509 		kvm_make_request(KVM_REQ_UNHALT, vcpu);
2510 		goto out;
2511 	}
2512 	if (kvm_cpu_has_pending_timer(vcpu))
2513 		goto out;
2514 	if (signal_pending(current))
2515 		goto out;
2516 
2517 	ret = 0;
2518 out:
2519 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2520 	return ret;
2521 }
2522 
2523 /*
2524  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
2525  */
2526 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2527 {
2528 	ktime_t start, cur;
2529 	DECLARE_SWAITQUEUE(wait);
2530 	bool waited = false;
2531 	u64 block_ns;
2532 
2533 	kvm_arch_vcpu_blocking(vcpu);
2534 
2535 	start = cur = ktime_get();
2536 	if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
2537 		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
2538 
2539 		++vcpu->stat.halt_attempted_poll;
2540 		do {
2541 			/*
2542 			 * This sets KVM_REQ_UNHALT if an interrupt
2543 			 * arrives.
2544 			 */
2545 			if (kvm_vcpu_check_block(vcpu) < 0) {
2546 				++vcpu->stat.halt_successful_poll;
2547 				if (!vcpu_valid_wakeup(vcpu))
2548 					++vcpu->stat.halt_poll_invalid;
2549 				goto out;
2550 			}
2551 			cur = ktime_get();
2552 		} while (single_task_running() && ktime_before(cur, stop));
2553 	}
2554 
2555 	for (;;) {
2556 		prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2557 
2558 		if (kvm_vcpu_check_block(vcpu) < 0)
2559 			break;
2560 
2561 		waited = true;
2562 		schedule();
2563 	}
2564 
2565 	finish_swait(&vcpu->wq, &wait);
2566 	cur = ktime_get();
2567 out:
2568 	kvm_arch_vcpu_unblocking(vcpu);
2569 	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
2570 
2571 	if (!kvm_arch_no_poll(vcpu)) {
2572 		if (!vcpu_valid_wakeup(vcpu)) {
2573 			shrink_halt_poll_ns(vcpu);
2574 		} else if (halt_poll_ns) {
2575 			if (block_ns <= vcpu->halt_poll_ns)
2576 				;
2577 			/* we had a long block, shrink polling */
2578 			else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
2579 				shrink_halt_poll_ns(vcpu);
2580 			/* we had a short halt and our poll time is too small */
2581 			else if (vcpu->halt_poll_ns < halt_poll_ns &&
2582 				block_ns < halt_poll_ns)
2583 				grow_halt_poll_ns(vcpu);
2584 		} else {
2585 			vcpu->halt_poll_ns = 0;
2586 		}
2587 	}
2588 
2589 	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
2590 	kvm_arch_vcpu_block_finish(vcpu);
2591 }
2592 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
2593 
2594 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
2595 {
2596 	struct swait_queue_head *wqp;
2597 
2598 	wqp = kvm_arch_vcpu_wq(vcpu);
2599 	if (swq_has_sleeper(wqp)) {
2600 		swake_up_one(wqp);
2601 		WRITE_ONCE(vcpu->ready, true);
2602 		++vcpu->stat.halt_wakeup;
2603 		return true;
2604 	}
2605 
2606 	return false;
2607 }
2608 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
2609 
2610 #ifndef CONFIG_S390
2611 /*
2612  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
2613  */
2614 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
2615 {
2616 	int me;
2617 	int cpu = vcpu->cpu;
2618 
2619 	if (kvm_vcpu_wake_up(vcpu))
2620 		return;
2621 
2622 	me = get_cpu();
2623 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
2624 		if (kvm_arch_vcpu_should_kick(vcpu))
2625 			smp_send_reschedule(cpu);
2626 	put_cpu();
2627 }
2628 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
2629 #endif /* !CONFIG_S390 */
2630 
2631 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
2632 {
2633 	struct pid *pid;
2634 	struct task_struct *task = NULL;
2635 	int ret = 0;
2636 
2637 	rcu_read_lock();
2638 	pid = rcu_dereference(target->pid);
2639 	if (pid)
2640 		task = get_pid_task(pid, PIDTYPE_PID);
2641 	rcu_read_unlock();
2642 	if (!task)
2643 		return ret;
2644 	ret = yield_to(task, 1);
2645 	put_task_struct(task);
2646 
2647 	return ret;
2648 }
2649 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
2650 
2651 /*
2652  * Helper that checks whether a VCPU is eligible for directed yield.
2653  *  The most eligible candidate to yield to is decided by the following heuristics:
2654  *
2655  *  (a) VCPU which has not done a pl-exit or cpu relax intercept recently
2656  *  (preempted lock holder), indicated by @in_spin_loop.
2657  *  Set at the beginning and cleared at the end of interception/PLE handler.
2658  *
2659  *  (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a
2660  *  chance last time (it has mostly become eligible now since we probably
2661  *  yielded to the lock holder in the last iteration.  This is done by toggling
2662  *  @dy_eligible each time a VCPU is checked for eligibility.)
2663  *
2664  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
2665  *  to the preempted lock-holder could result in wrong VCPU selection and CPU
2666  *  burning.  Giving priority to a potential lock-holder increases lock
2667  *  progress.
2668  *
2669  *  Since the algorithm is based on heuristics, accessing another VCPU's data
2670  *  without locking does no harm.  It may result in trying to yield to the same
2671  *  VCPU, failing, and continuing with the next VCPU, and so on.
2672  */
2673 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
2674 {
2675 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
2676 	bool eligible;
2677 
2678 	eligible = !vcpu->spin_loop.in_spin_loop ||
2679 		    vcpu->spin_loop.dy_eligible;
2680 
2681 	if (vcpu->spin_loop.in_spin_loop)
2682 		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
2683 
2684 	return eligible;
2685 #else
2686 	return true;
2687 #endif
2688 }
2689 
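/*
 * A worked example of the toggling above: a vCPU that is in_spin_loop with
 * dy_eligible == false is reported ineligible on the first check (and
 * flipped to eligible), then reported eligible on the next check (and
 * flipped back), so repeated PLE exits alternate between skipping it and
 * yielding to it.
 */
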
2690 /*
2691  * Unlike kvm_arch_vcpu_runnable, this function is called outside
2692  * a vcpu_load/vcpu_put pair.  However, for most architectures
2693  * kvm_arch_vcpu_runnable does not require vcpu_load.
2694  */
2695 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
2696 {
2697 	return kvm_arch_vcpu_runnable(vcpu);
2698 }
2699 
2700 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
2701 {
2702 	if (kvm_arch_dy_runnable(vcpu))
2703 		return true;
2704 
2705 #ifdef CONFIG_KVM_ASYNC_PF
2706 	if (!list_empty_careful(&vcpu->async_pf.done))
2707 		return true;
2708 #endif
2709 
2710 	return false;
2711 }
2712 
2713 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
2714 {
2715 	struct kvm *kvm = me->kvm;
2716 	struct kvm_vcpu *vcpu;
2717 	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
2718 	int yielded = 0;
2719 	int try = 3;
2720 	int pass;
2721 	int i;
2722 
2723 	kvm_vcpu_set_in_spin_loop(me, true);
2724 	/*
2725 	 * We boost the priority of a VCPU that is runnable but not
2726 	 * currently running, because it got preempted by something
2727 	 * else and called schedule in __vcpu_run.  Hopefully that
2728 	 * VCPU is holding the lock that we need and will release it.
2729 	 * We approximate round-robin by starting at the last boosted VCPU.
2730 	 */
2731 	for (pass = 0; pass < 2 && !yielded && try; pass++) {
2732 		kvm_for_each_vcpu(i, vcpu, kvm) {
2733 			if (!pass && i <= last_boosted_vcpu) {
2734 				i = last_boosted_vcpu;
2735 				continue;
2736 			} else if (pass && i > last_boosted_vcpu)
2737 				break;
2738 			if (!READ_ONCE(vcpu->ready))
2739 				continue;
2740 			if (vcpu == me)
2741 				continue;
2742 			if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
2743 				continue;
2744 			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
2745 				!kvm_arch_vcpu_in_kernel(vcpu))
2746 				continue;
2747 			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
2748 				continue;
2749 
2750 			yielded = kvm_vcpu_yield_to(vcpu);
2751 			if (yielded > 0) {
2752 				kvm->last_boosted_vcpu = i;
2753 				break;
2754 			} else if (yielded < 0) {
2755 				try--;
2756 				if (!try)
2757 					break;
2758 			}
2759 		}
2760 	}
2761 	kvm_vcpu_set_in_spin_loop(me, false);
2762 
2763 	/* Ensure vcpu is not eligible during next spinloop */
2764 	kvm_vcpu_set_dy_eligible(me, false);
2765 }
2766 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
2767 
2768 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
2769 {
2770 	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
2771 	struct page *page;
2772 
2773 	if (vmf->pgoff == 0)
2774 		page = virt_to_page(vcpu->run);
2775 #ifdef CONFIG_X86
2776 	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
2777 		page = virt_to_page(vcpu->arch.pio_data);
2778 #endif
2779 #ifdef CONFIG_KVM_MMIO
2780 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
2781 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
2782 #endif
2783 	else
2784 		return kvm_arch_vcpu_fault(vcpu, vmf);
2785 	get_page(page);
2786 	vmf->page = page;
2787 	return 0;
2788 }
2789 
2790 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
2791 	.fault = kvm_vcpu_fault,
2792 };
2793 
2794 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2795 {
2796 	vma->vm_ops = &kvm_vcpu_vm_ops;
2797 	return 0;
2798 }
2799 
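/*
 * A minimal userspace sketch of the mmap layout served by kvm_vcpu_fault()
 * above: page offset 0 of a vcpu fd is struct kvm_run.  The fds are assumed
 * to come from /dev/kvm and KVM_CREATE_VCPU; error handling is omitted.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static struct kvm_run *example_map_run(int kvm_fd, int vcpu_fd)
{
        long size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);

        return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    vcpu_fd, 0);
}
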
2800 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2801 {
2802 	struct kvm_vcpu *vcpu = filp->private_data;
2803 
2804 	debugfs_remove_recursive(vcpu->debugfs_dentry);
2805 	kvm_put_kvm(vcpu->kvm);
2806 	return 0;
2807 }
2808 
2809 static struct file_operations kvm_vcpu_fops = {
2810 	.release        = kvm_vcpu_release,
2811 	.unlocked_ioctl = kvm_vcpu_ioctl,
2812 	.mmap           = kvm_vcpu_mmap,
2813 	.llseek		= noop_llseek,
2814 	KVM_COMPAT(kvm_vcpu_compat_ioctl),
2815 };
2816 
2817 /*
2818  * Allocates an inode for the vcpu.
2819  */
2820 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2821 {
2822 	char name[8 + 1 + ITOA_MAX_LEN + 1];
2823 
2824 	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
2825 	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
2826 }
2827 
2828 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
2829 {
2830 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
2831 	char dir_name[ITOA_MAX_LEN * 2];
2832 
2833 	if (!debugfs_initialized())
2834 		return;
2835 
2836 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
2837 	vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
2838 						  vcpu->kvm->debugfs_dentry);
2839 
2840 	kvm_arch_create_vcpu_debugfs(vcpu);
2841 #endif
2842 }
2843 
2844 /*
2845  * Creates some virtual cpus.  Good luck creating more than one.
2846  */
2847 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
2848 {
2849 	int r;
2850 	struct kvm_vcpu *vcpu;
2851 
2852 	if (id >= KVM_MAX_VCPU_ID)
2853 		return -EINVAL;
2854 
2855 	mutex_lock(&kvm->lock);
2856 	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
2857 		mutex_unlock(&kvm->lock);
2858 		return -EINVAL;
2859 	}
2860 
2861 	kvm->created_vcpus++;
2862 	mutex_unlock(&kvm->lock);
2863 
2864 	vcpu = kvm_arch_vcpu_create(kvm, id);
2865 	if (IS_ERR(vcpu)) {
2866 		r = PTR_ERR(vcpu);
2867 		goto vcpu_decrement;
2868 	}
2869 
2870 	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2871 
2872 	r = kvm_arch_vcpu_setup(vcpu);
2873 	if (r)
2874 		goto vcpu_destroy;
2875 
2876 	kvm_create_vcpu_debugfs(vcpu);
2877 
2878 	mutex_lock(&kvm->lock);
2879 	if (kvm_get_vcpu_by_id(kvm, id)) {
2880 		r = -EEXIST;
2881 		goto unlock_vcpu_destroy;
2882 	}
2883 
2884 	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
2885 	BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
2886 
2887 	/* Now it's all set up, let userspace reach it */
2888 	kvm_get_kvm(kvm);
2889 	r = create_vcpu_fd(vcpu);
2890 	if (r < 0) {
2891 		kvm_put_kvm(kvm);
2892 		goto unlock_vcpu_destroy;
2893 	}
2894 
2895 	kvm->vcpus[vcpu->vcpu_idx] = vcpu;
2896 
2897 	/*
2898 	 * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
2899 	 * before the incremented value of kvm->online_vcpus.
2900 	 */
2901 	smp_wmb();
2902 	atomic_inc(&kvm->online_vcpus);
2903 
2904 	mutex_unlock(&kvm->lock);
2905 	kvm_arch_vcpu_postcreate(vcpu);
2906 	return r;
2907 
2908 unlock_vcpu_destroy:
2909 	mutex_unlock(&kvm->lock);
2910 	debugfs_remove_recursive(vcpu->debugfs_dentry);
2911 vcpu_destroy:
2912 	kvm_arch_vcpu_destroy(vcpu);
2913 vcpu_decrement:
2914 	mutex_lock(&kvm->lock);
2915 	kvm->created_vcpus--;
2916 	mutex_unlock(&kvm->lock);
2917 	return r;
2918 }
2919 
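/*
 * A minimal userspace sketch of the ioctl path handled above.  The fds are
 * assumed to come from open("/dev/kvm") and KVM_CREATE_VM; error handling
 * is omitted for brevity.
 */
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_create_vcpu0(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
        int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

        /* Returns the new vcpu fd on success, -1 with errno set otherwise. */
        return ioctl(vm_fd, KVM_CREATE_VCPU, 0);
}
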
2920 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2921 {
2922 	if (sigset) {
2923 		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2924 		vcpu->sigset_active = 1;
2925 		vcpu->sigset = *sigset;
2926 	} else
2927 		vcpu->sigset_active = 0;
2928 	return 0;
2929 }
2930 
2931 static long kvm_vcpu_ioctl(struct file *filp,
2932 			   unsigned int ioctl, unsigned long arg)
2933 {
2934 	struct kvm_vcpu *vcpu = filp->private_data;
2935 	void __user *argp = (void __user *)arg;
2936 	int r;
2937 	struct kvm_fpu *fpu = NULL;
2938 	struct kvm_sregs *kvm_sregs = NULL;
2939 
2940 	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
2941 		return -EIO;
2942 
2943 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
2944 		return -EINVAL;
2945 
2946 	/*
2947 	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
2948 	 * execution; mutex_lock() would break them.
2949 	 */
2950 	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
2951 	if (r != -ENOIOCTLCMD)
2952 		return r;
2953 
2954 	if (mutex_lock_killable(&vcpu->mutex))
2955 		return -EINTR;
2956 	switch (ioctl) {
2957 	case KVM_RUN: {
2958 		struct pid *oldpid;
2959 		r = -EINVAL;
2960 		if (arg)
2961 			goto out;
2962 		oldpid = rcu_access_pointer(vcpu->pid);
2963 		if (unlikely(oldpid != task_pid(current))) {
2964 			/* The thread running this VCPU changed. */
2965 			struct pid *newpid;
2966 
2967 			r = kvm_arch_vcpu_run_pid_change(vcpu);
2968 			if (r)
2969 				break;
2970 
2971 			newpid = get_task_pid(current, PIDTYPE_PID);
2972 			rcu_assign_pointer(vcpu->pid, newpid);
2973 			if (oldpid)
2974 				synchronize_rcu();
2975 			put_pid(oldpid);
2976 		}
2977 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
2978 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
2979 		break;
2980 	}
2981 	case KVM_GET_REGS: {
2982 		struct kvm_regs *kvm_regs;
2983 
2984 		r = -ENOMEM;
2985 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
2986 		if (!kvm_regs)
2987 			goto out;
2988 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
2989 		if (r)
2990 			goto out_free1;
2991 		r = -EFAULT;
2992 		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
2993 			goto out_free1;
2994 		r = 0;
2995 out_free1:
2996 		kfree(kvm_regs);
2997 		break;
2998 	}
2999 	case KVM_SET_REGS: {
3000 		struct kvm_regs *kvm_regs;
3001 
3002 		r = -ENOMEM;
3003 		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3004 		if (IS_ERR(kvm_regs)) {
3005 			r = PTR_ERR(kvm_regs);
3006 			goto out;
3007 		}
3008 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3009 		kfree(kvm_regs);
3010 		break;
3011 	}
3012 	case KVM_GET_SREGS: {
3013 		kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3014 				    GFP_KERNEL_ACCOUNT);
3015 		r = -ENOMEM;
3016 		if (!kvm_sregs)
3017 			goto out;
3018 		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3019 		if (r)
3020 			goto out;
3021 		r = -EFAULT;
3022 		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3023 			goto out;
3024 		r = 0;
3025 		break;
3026 	}
3027 	case KVM_SET_SREGS: {
3028 		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3029 		if (IS_ERR(kvm_sregs)) {
3030 			r = PTR_ERR(kvm_sregs);
3031 			kvm_sregs = NULL;
3032 			goto out;
3033 		}
3034 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3035 		break;
3036 	}
3037 	case KVM_GET_MP_STATE: {
3038 		struct kvm_mp_state mp_state;
3039 
3040 		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3041 		if (r)
3042 			goto out;
3043 		r = -EFAULT;
3044 		if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3045 			goto out;
3046 		r = 0;
3047 		break;
3048 	}
3049 	case KVM_SET_MP_STATE: {
3050 		struct kvm_mp_state mp_state;
3051 
3052 		r = -EFAULT;
3053 		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3054 			goto out;
3055 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3056 		break;
3057 	}
3058 	case KVM_TRANSLATE: {
3059 		struct kvm_translation tr;
3060 
3061 		r = -EFAULT;
3062 		if (copy_from_user(&tr, argp, sizeof(tr)))
3063 			goto out;
3064 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3065 		if (r)
3066 			goto out;
3067 		r = -EFAULT;
3068 		if (copy_to_user(argp, &tr, sizeof(tr)))
3069 			goto out;
3070 		r = 0;
3071 		break;
3072 	}
3073 	case KVM_SET_GUEST_DEBUG: {
3074 		struct kvm_guest_debug dbg;
3075 
3076 		r = -EFAULT;
3077 		if (copy_from_user(&dbg, argp, sizeof(dbg)))
3078 			goto out;
3079 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3080 		break;
3081 	}
3082 	case KVM_SET_SIGNAL_MASK: {
3083 		struct kvm_signal_mask __user *sigmask_arg = argp;
3084 		struct kvm_signal_mask kvm_sigmask;
3085 		sigset_t sigset, *p;
3086 
3087 		p = NULL;
3088 		if (argp) {
3089 			r = -EFAULT;
3090 			if (copy_from_user(&kvm_sigmask, argp,
3091 					   sizeof(kvm_sigmask)))
3092 				goto out;
3093 			r = -EINVAL;
3094 			if (kvm_sigmask.len != sizeof(sigset))
3095 				goto out;
3096 			r = -EFAULT;
3097 			if (copy_from_user(&sigset, sigmask_arg->sigset,
3098 					   sizeof(sigset)))
3099 				goto out;
3100 			p = &sigset;
3101 		}
3102 		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3103 		break;
3104 	}
3105 	case KVM_GET_FPU: {
3106 		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3107 		r = -ENOMEM;
3108 		if (!fpu)
3109 			goto out;
3110 		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3111 		if (r)
3112 			goto out;
3113 		r = -EFAULT;
3114 		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3115 			goto out;
3116 		r = 0;
3117 		break;
3118 	}
3119 	case KVM_SET_FPU: {
3120 		fpu = memdup_user(argp, sizeof(*fpu));
3121 		if (IS_ERR(fpu)) {
3122 			r = PTR_ERR(fpu);
3123 			fpu = NULL;
3124 			goto out;
3125 		}
3126 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3127 		break;
3128 	}
3129 	default:
3130 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3131 	}
3132 out:
3133 	mutex_unlock(&vcpu->mutex);
3134 	kfree(fpu);
3135 	kfree(kvm_sregs);
3136 	return r;
3137 }
3138 
3139 #ifdef CONFIG_KVM_COMPAT
3140 static long kvm_vcpu_compat_ioctl(struct file *filp,
3141 				  unsigned int ioctl, unsigned long arg)
3142 {
3143 	struct kvm_vcpu *vcpu = filp->private_data;
3144 	void __user *argp = compat_ptr(arg);
3145 	int r;
3146 
3147 	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3148 		return -EIO;
3149 
3150 	switch (ioctl) {
3151 	case KVM_SET_SIGNAL_MASK: {
3152 		struct kvm_signal_mask __user *sigmask_arg = argp;
3153 		struct kvm_signal_mask kvm_sigmask;
3154 		sigset_t sigset;
3155 
3156 		if (argp) {
3157 			r = -EFAULT;
3158 			if (copy_from_user(&kvm_sigmask, argp,
3159 					   sizeof(kvm_sigmask)))
3160 				goto out;
3161 			r = -EINVAL;
3162 			if (kvm_sigmask.len != sizeof(compat_sigset_t))
3163 				goto out;
3164 			r = -EFAULT;
3165 			if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
3166 				goto out;
3167 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3168 		} else
3169 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3170 		break;
3171 	}
3172 	default:
3173 		r = kvm_vcpu_ioctl(filp, ioctl, arg);
3174 	}
3175 
3176 out:
3177 	return r;
3178 }
3179 #endif
3180 
3181 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3182 {
3183 	struct kvm_device *dev = filp->private_data;
3184 
3185 	if (dev->ops->mmap)
3186 		return dev->ops->mmap(dev, vma);
3187 
3188 	return -ENODEV;
3189 }
3190 
3191 static int kvm_device_ioctl_attr(struct kvm_device *dev,
3192 				 int (*accessor)(struct kvm_device *dev,
3193 						 struct kvm_device_attr *attr),
3194 				 unsigned long arg)
3195 {
3196 	struct kvm_device_attr attr;
3197 
3198 	if (!accessor)
3199 		return -EPERM;
3200 
3201 	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3202 		return -EFAULT;
3203 
3204 	return accessor(dev, &attr);
3205 }
3206 
3207 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3208 			     unsigned long arg)
3209 {
3210 	struct kvm_device *dev = filp->private_data;
3211 
3212 	if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
3213 		return -EIO;
3214 
3215 	switch (ioctl) {
3216 	case KVM_SET_DEVICE_ATTR:
3217 		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3218 	case KVM_GET_DEVICE_ATTR:
3219 		return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3220 	case KVM_HAS_DEVICE_ATTR:
3221 		return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3222 	default:
3223 		if (dev->ops->ioctl)
3224 			return dev->ops->ioctl(dev, ioctl, arg);
3225 
3226 		return -ENOTTY;
3227 	}
3228 }
3229 
3230 static int kvm_device_release(struct inode *inode, struct file *filp)
3231 {
3232 	struct kvm_device *dev = filp->private_data;
3233 	struct kvm *kvm = dev->kvm;
3234 
3235 	if (dev->ops->release) {
3236 		mutex_lock(&kvm->lock);
3237 		list_del(&dev->vm_node);
3238 		dev->ops->release(dev);
3239 		mutex_unlock(&kvm->lock);
3240 	}
3241 
3242 	kvm_put_kvm(kvm);
3243 	return 0;
3244 }
3245 
3246 static const struct file_operations kvm_device_fops = {
3247 	.unlocked_ioctl = kvm_device_ioctl,
3248 	.release = kvm_device_release,
3249 	KVM_COMPAT(kvm_device_ioctl),
3250 	.mmap = kvm_device_mmap,
3251 };
3252 
3253 struct kvm_device *kvm_device_from_filp(struct file *filp)
3254 {
3255 	if (filp->f_op != &kvm_device_fops)
3256 		return NULL;
3257 
3258 	return filp->private_data;
3259 }
3260 
3261 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3262 #ifdef CONFIG_KVM_MPIC
3263 	[KVM_DEV_TYPE_FSL_MPIC_20]	= &kvm_mpic_ops,
3264 	[KVM_DEV_TYPE_FSL_MPIC_42]	= &kvm_mpic_ops,
3265 #endif
3266 };
3267 
3268 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
3269 {
3270 	if (type >= ARRAY_SIZE(kvm_device_ops_table))
3271 		return -ENOSPC;
3272 
3273 	if (kvm_device_ops_table[type] != NULL)
3274 		return -EEXIST;
3275 
3276 	kvm_device_ops_table[type] = ops;
3277 	return 0;
3278 }
3279 
3280 void kvm_unregister_device_ops(u32 type)
3281 {
3282 	if (kvm_device_ops_table[type] != NULL)
3283 		kvm_device_ops_table[type] = NULL;
3284 }
3285 
3286 static int kvm_ioctl_create_device(struct kvm *kvm,
3287 				   struct kvm_create_device *cd)
3288 {
3289 	struct kvm_device_ops *ops = NULL;
3290 	struct kvm_device *dev;
3291 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
3292 	int type;
3293 	int ret;
3294 
3295 	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
3296 		return -ENODEV;
3297 
3298 	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
3299 	ops = kvm_device_ops_table[type];
3300 	if (ops == NULL)
3301 		return -ENODEV;
3302 
3303 	if (test)
3304 		return 0;
3305 
3306 	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
3307 	if (!dev)
3308 		return -ENOMEM;
3309 
3310 	dev->ops = ops;
3311 	dev->kvm = kvm;
3312 
3313 	mutex_lock(&kvm->lock);
3314 	ret = ops->create(dev, type);
3315 	if (ret < 0) {
3316 		mutex_unlock(&kvm->lock);
3317 		kfree(dev);
3318 		return ret;
3319 	}
3320 	list_add(&dev->vm_node, &kvm->devices);
3321 	mutex_unlock(&kvm->lock);
3322 
3323 	if (ops->init)
3324 		ops->init(dev);
3325 
3326 	kvm_get_kvm(kvm);
3327 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
3328 	if (ret < 0) {
3329 		kvm_put_kvm(kvm);
3330 		mutex_lock(&kvm->lock);
3331 		list_del(&dev->vm_node);
3332 		if (ops->release)
3333 			ops->release(dev);
3334 		mutex_unlock(&kvm->lock);
3335 		if (ops->destroy)
3336 			ops->destroy(dev);
3337 		return ret;
3338 	}
3339 
3340 	cd->fd = ret;
3341 	return 0;
3342 }
3343 
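/*
 * A minimal userspace sketch of the KVM_CREATE_DEVICE flow handled above.
 * The probe-then-create pattern and the helper name are illustrative only;
 * KVM_CREATE_DEVICE_TEST merely checks whether the type is supported
 * without instantiating anything.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_create_device(int vm_fd, __u32 type)
{
        struct kvm_create_device cd = {
                .type = type,
                .flags = KVM_CREATE_DEVICE_TEST,
        };

        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd))
                return -1;              /* type not supported */

        /* Really create it this time; on success cd.fd holds the device fd. */
        cd.flags = 0;
        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd))
                return -1;
        return cd.fd;
}
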
3344 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
3345 {
3346 	switch (arg) {
3347 	case KVM_CAP_USER_MEMORY:
3348 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
3349 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
3350 	case KVM_CAP_INTERNAL_ERROR_DATA:
3351 #ifdef CONFIG_HAVE_KVM_MSI
3352 	case KVM_CAP_SIGNAL_MSI:
3353 #endif
3354 #ifdef CONFIG_HAVE_KVM_IRQFD
3355 	case KVM_CAP_IRQFD:
3356 	case KVM_CAP_IRQFD_RESAMPLE:
3357 #endif
3358 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
3359 	case KVM_CAP_CHECK_EXTENSION_VM:
3360 	case KVM_CAP_ENABLE_CAP_VM:
3361 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3362 	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3363 #endif
3364 		return 1;
3365 #ifdef CONFIG_KVM_MMIO
3366 	case KVM_CAP_COALESCED_MMIO:
3367 		return KVM_COALESCED_MMIO_PAGE_OFFSET;
3368 	case KVM_CAP_COALESCED_PIO:
3369 		return 1;
3370 #endif
3371 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3372 	case KVM_CAP_IRQ_ROUTING:
3373 		return KVM_MAX_IRQ_ROUTES;
3374 #endif
3375 #if KVM_ADDRESS_SPACE_NUM > 1
3376 	case KVM_CAP_MULTI_ADDRESS_SPACE:
3377 		return KVM_ADDRESS_SPACE_NUM;
3378 #endif
3379 	case KVM_CAP_NR_MEMSLOTS:
3380 		return KVM_USER_MEM_SLOTS;
3381 	default:
3382 		break;
3383 	}
3384 	return kvm_vm_ioctl_check_extension(kvm, arg);
3385 }
3386 
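/*
 * A minimal userspace sketch of KVM_CHECK_EXTENSION against a VM fd, which
 * is what KVM_CAP_CHECK_EXTENSION_VM above advertises.  The helper name is
 * illustrative only.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_has_cap(int vm_fd, long cap)
{
        /* 0 means unsupported; positive values are capability-specific. */
        return ioctl(vm_fd, KVM_CHECK_EXTENSION, cap);
}
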
3387 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3388 						  struct kvm_enable_cap *cap)
3389 {
3390 	return -EINVAL;
3391 }
3392 
3393 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3394 					   struct kvm_enable_cap *cap)
3395 {
3396 	switch (cap->cap) {
3397 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3398 	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3399 		if (cap->flags || (cap->args[0] & ~1))
3400 			return -EINVAL;
3401 		kvm->manual_dirty_log_protect = cap->args[0];
3402 		return 0;
3403 #endif
3404 	default:
3405 		return kvm_vm_ioctl_enable_cap(kvm, cap);
3406 	}
3407 }
3408 
3409 static long kvm_vm_ioctl(struct file *filp,
3410 			   unsigned int ioctl, unsigned long arg)
3411 {
3412 	struct kvm *kvm = filp->private_data;
3413 	void __user *argp = (void __user *)arg;
3414 	int r;
3415 
3416 	if (kvm->mm != current->mm || kvm->vm_bugged)
3417 		return -EIO;
3418 	switch (ioctl) {
3419 	case KVM_CREATE_VCPU:
3420 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3421 		break;
3422 	case KVM_ENABLE_CAP: {
3423 		struct kvm_enable_cap cap;
3424 
3425 		r = -EFAULT;
3426 		if (copy_from_user(&cap, argp, sizeof(cap)))
3427 			goto out;
3428 		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3429 		break;
3430 	}
3431 	case KVM_SET_USER_MEMORY_REGION: {
3432 		struct kvm_userspace_memory_region kvm_userspace_mem;
3433 
3434 		r = -EFAULT;
3435 		if (copy_from_user(&kvm_userspace_mem, argp,
3436 						sizeof(kvm_userspace_mem)))
3437 			goto out;
3438 
3439 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
3440 		break;
3441 	}
3442 	case KVM_GET_DIRTY_LOG: {
3443 		struct kvm_dirty_log log;
3444 
3445 		r = -EFAULT;
3446 		if (copy_from_user(&log, argp, sizeof(log)))
3447 			goto out;
3448 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3449 		break;
3450 	}
3451 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3452 	case KVM_CLEAR_DIRTY_LOG: {
3453 		struct kvm_clear_dirty_log log;
3454 
3455 		r = -EFAULT;
3456 		if (copy_from_user(&log, argp, sizeof(log)))
3457 			goto out;
3458 		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3459 		break;
3460 	}
3461 #endif
3462 #ifdef CONFIG_KVM_MMIO
3463 	case KVM_REGISTER_COALESCED_MMIO: {
3464 		struct kvm_coalesced_mmio_zone zone;
3465 
3466 		r = -EFAULT;
3467 		if (copy_from_user(&zone, argp, sizeof(zone)))
3468 			goto out;
3469 		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
3470 		break;
3471 	}
3472 	case KVM_UNREGISTER_COALESCED_MMIO: {
3473 		struct kvm_coalesced_mmio_zone zone;
3474 
3475 		r = -EFAULT;
3476 		if (copy_from_user(&zone, argp, sizeof(zone)))
3477 			goto out;
3478 		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
3479 		break;
3480 	}
3481 #endif
3482 	case KVM_IRQFD: {
3483 		struct kvm_irqfd data;
3484 
3485 		r = -EFAULT;
3486 		if (copy_from_user(&data, argp, sizeof(data)))
3487 			goto out;
3488 		r = kvm_irqfd(kvm, &data);
3489 		break;
3490 	}
3491 	case KVM_IOEVENTFD: {
3492 		struct kvm_ioeventfd data;
3493 
3494 		r = -EFAULT;
3495 		if (copy_from_user(&data, argp, sizeof(data)))
3496 			goto out;
3497 		r = kvm_ioeventfd(kvm, &data);
3498 		break;
3499 	}
3500 #ifdef CONFIG_HAVE_KVM_MSI
3501 	case KVM_SIGNAL_MSI: {
3502 		struct kvm_msi msi;
3503 
3504 		r = -EFAULT;
3505 		if (copy_from_user(&msi, argp, sizeof(msi)))
3506 			goto out;
3507 		r = kvm_send_userspace_msi(kvm, &msi);
3508 		break;
3509 	}
3510 #endif
3511 #ifdef __KVM_HAVE_IRQ_LINE
3512 	case KVM_IRQ_LINE_STATUS:
3513 	case KVM_IRQ_LINE: {
3514 		struct kvm_irq_level irq_event;
3515 
3516 		r = -EFAULT;
3517 		if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
3518 			goto out;
3519 
3520 		r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
3521 					ioctl == KVM_IRQ_LINE_STATUS);
3522 		if (r)
3523 			goto out;
3524 
3525 		r = -EFAULT;
3526 		if (ioctl == KVM_IRQ_LINE_STATUS) {
3527 			if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
3528 				goto out;
3529 		}
3530 
3531 		r = 0;
3532 		break;
3533 	}
3534 #endif
3535 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3536 	case KVM_SET_GSI_ROUTING: {
3537 		struct kvm_irq_routing routing;
3538 		struct kvm_irq_routing __user *urouting;
3539 		struct kvm_irq_routing_entry *entries = NULL;
3540 
3541 		r = -EFAULT;
3542 		if (copy_from_user(&routing, argp, sizeof(routing)))
3543 			goto out;
3544 		r = -EINVAL;
3545 		if (!kvm_arch_can_set_irq_routing(kvm))
3546 			goto out;
3547 		if (routing.nr > KVM_MAX_IRQ_ROUTES)
3548 			goto out;
3549 		if (routing.flags)
3550 			goto out;
3551 		if (routing.nr) {
3552 			r = -ENOMEM;
3553 			entries = vmalloc(array_size(sizeof(*entries),
3554 						     routing.nr));
3555 			if (!entries)
3556 				goto out;
3557 			r = -EFAULT;
3558 			urouting = argp;
3559 			if (copy_from_user(entries, urouting->entries,
3560 					   routing.nr * sizeof(*entries)))
3561 				goto out_free_irq_routing;
3562 		}
3563 		r = kvm_set_irq_routing(kvm, entries, routing.nr,
3564 					routing.flags);
3565 out_free_irq_routing:
3566 		vfree(entries);
3567 		break;
3568 	}
3569 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
3570 	case KVM_CREATE_DEVICE: {
3571 		struct kvm_create_device cd;
3572 
3573 		r = -EFAULT;
3574 		if (copy_from_user(&cd, argp, sizeof(cd)))
3575 			goto out;
3576 
3577 		r = kvm_ioctl_create_device(kvm, &cd);
3578 		if (r)
3579 			goto out;
3580 
3581 		r = -EFAULT;
3582 		if (copy_to_user(argp, &cd, sizeof(cd)))
3583 			goto out;
3584 
3585 		r = 0;
3586 		break;
3587 	}
3588 	case KVM_CHECK_EXTENSION:
3589 		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
3590 		break;
3591 	default:
3592 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
3593 	}
3594 out:
3595 	return r;
3596 }
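/*
 * Illustrative sketch (not part of this file): each ioctl handled above is
 * issued from userspace against the VM file descriptor returned by
 * KVM_CREATE_VM.  For example, a memory slot is installed roughly like this
 * (vm_fd, mem_size and host_mem are placeholder names):
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size = mem_size,
 *		.userspace_addr = (__u64)(unsigned long)host_mem,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */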
3597 
3598 #ifdef CONFIG_KVM_COMPAT
3599 struct compat_kvm_dirty_log {
3600 	__u32 slot;
3601 	__u32 padding1;
3602 	union {
3603 		compat_uptr_t dirty_bitmap; /* one bit per page */
3604 		__u64 padding2;
3605 	};
3606 };
3607 
3608 struct compat_kvm_clear_dirty_log {
3609 	__u32 slot;
3610 	__u32 num_pages;
3611 	__u64 first_page;
3612 	union {
3613 		compat_uptr_t dirty_bitmap; /* one bit per page */
3614 		__u64 padding2;
3615 	};
3616 };
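/*
 * These compat layouts mirror struct kvm_dirty_log and
 * struct kvm_clear_dirty_log for 32-bit userspace: the bitmap pointer is a
 * compat_uptr_t and is widened with compat_ptr() in the handler below before
 * the native helpers are called.
 */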
3617 
3618 static long kvm_vm_compat_ioctl(struct file *filp,
3619 			   unsigned int ioctl, unsigned long arg)
3620 {
3621 	struct kvm *kvm = filp->private_data;
3622 	int r;
3623 
3624 	if (kvm->mm != current->mm || kvm->vm_bugged)
3625 		return -EIO;
3626 	switch (ioctl) {
3627 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3628 	case KVM_CLEAR_DIRTY_LOG: {
3629 		struct compat_kvm_clear_dirty_log compat_log;
3630 		struct kvm_clear_dirty_log log;
3631 
3632 		if (copy_from_user(&compat_log, (void __user *)arg,
3633 				   sizeof(compat_log)))
3634 			return -EFAULT;
3635 		log.slot	 = compat_log.slot;
3636 		log.num_pages	 = compat_log.num_pages;
3637 		log.first_page	 = compat_log.first_page;
3638 		log.padding2	 = compat_log.padding2;
3639 		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
3640 
3641 		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3642 		break;
3643 	}
3644 #endif
3645 	case KVM_GET_DIRTY_LOG: {
3646 		struct compat_kvm_dirty_log compat_log;
3647 		struct kvm_dirty_log log;
3648 
3649 		if (copy_from_user(&compat_log, (void __user *)arg,
3650 				   sizeof(compat_log)))
3651 			return -EFAULT;
3652 		log.slot	 = compat_log.slot;
3653 		log.padding1	 = compat_log.padding1;
3654 		log.padding2	 = compat_log.padding2;
3655 		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
3656 
3657 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3658 		break;
3659 	}
3660 	default:
3661 		r = kvm_vm_ioctl(filp, ioctl, arg);
3662 	}
3663 	return r;
3664 }
3665 #endif
3666 
3667 static struct file_operations kvm_vm_fops = {
3668 	.release        = kvm_vm_release,
3669 	.unlocked_ioctl = kvm_vm_ioctl,
3670 	.llseek		= noop_llseek,
3671 	KVM_COMPAT(kvm_vm_compat_ioctl),
3672 };
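/*
 * Note: KVM_COMPAT() (defined earlier in this file) is expected to wire up
 * .compat_ioctl only when CONFIG_KVM_COMPAT is enabled.
 */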
3673 
3674 static int kvm_dev_ioctl_create_vm(unsigned long type)
3675 {
3676 	int r;
3677 	struct kvm *kvm;
3678 	struct file *file;
3679 
3680 	kvm = kvm_create_vm(type);
3681 	if (IS_ERR(kvm))
3682 		return PTR_ERR(kvm);
3683 #ifdef CONFIG_KVM_MMIO
3684 	r = kvm_coalesced_mmio_init(kvm);
3685 	if (r < 0)
3686 		goto put_kvm;
3687 #endif
3688 	r = get_unused_fd_flags(O_CLOEXEC);
3689 	if (r < 0)
3690 		goto put_kvm;
3691 
3692 	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
3693 	if (IS_ERR(file)) {
3694 		put_unused_fd(r);
3695 		r = PTR_ERR(file);
3696 		goto put_kvm;
3697 	}
3698 
3699 	/*
3700 	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
3701 	 * already set, with ->release() being kvm_vm_release().  In error
3702 	 * cases it will be called by the final fput(file) and will take
3703 	 * care of doing kvm_put_kvm(kvm).
3704 	 */
3705 	if (kvm_create_vm_debugfs(kvm, r) < 0) {
3706 		put_unused_fd(r);
3707 		fput(file);
3708 		return -ENOMEM;
3709 	}
3710 	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
3711 
3712 	fd_install(r, file);
3713 	return r;
3714 
3715 put_kvm:
3716 	kvm_put_kvm(kvm);
3717 	return r;
3718 }
3719 
3720 static long kvm_dev_ioctl(struct file *filp,
3721 			  unsigned int ioctl, unsigned long arg)
3722 {
3723 	long r = -EINVAL;
3724 
3725 	switch (ioctl) {
3726 	case KVM_GET_API_VERSION:
3727 		if (arg)
3728 			goto out;
3729 		r = KVM_API_VERSION;
3730 		break;
3731 	case KVM_CREATE_VM:
3732 		r = kvm_dev_ioctl_create_vm(arg);
3733 		break;
3734 	case KVM_CHECK_EXTENSION:
3735 		r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
3736 		break;
3737 	case KVM_GET_VCPU_MMAP_SIZE:
3738 		if (arg)
3739 			goto out;
3740 		r = PAGE_SIZE;     /* struct kvm_run */
3741 #ifdef CONFIG_X86
3742 		r += PAGE_SIZE;    /* pio data page */
3743 #endif
3744 #ifdef CONFIG_KVM_MMIO
3745 		r += PAGE_SIZE;    /* coalesced mmio ring page */
3746 #endif
3747 		break;
3748 	case KVM_TRACE_ENABLE:
3749 	case KVM_TRACE_PAUSE:
3750 	case KVM_TRACE_DISABLE:
3751 		r = -EOPNOTSUPP;
3752 		break;
3753 	default:
3754 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
3755 	}
3756 out:
3757 	return r;
3758 }
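/*
 * Illustrative sketch (not part of this file): a minimal userspace caller of
 * the /dev/kvm ioctls handled above might look like this (error handling
 * omitted for brevity):
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	int version = ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	int run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 */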
3759 
3760 static struct file_operations kvm_chardev_ops = {
3761 	.unlocked_ioctl = kvm_dev_ioctl,
3762 	.llseek		= noop_llseek,
3763 	KVM_COMPAT(kvm_dev_ioctl),
3764 };
3765 
3766 static struct miscdevice kvm_dev = {
3767 	KVM_MINOR,
3768 	"kvm",
3769 	&kvm_chardev_ops,
3770 };
3771 
3772 static void hardware_enable_nolock(void *junk)
3773 {
3774 	int cpu = raw_smp_processor_id();
3775 	int r;
3776 
3777 	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
3778 		return;
3779 
3780 	cpumask_set_cpu(cpu, cpus_hardware_enabled);
3781 
3782 	r = kvm_arch_hardware_enable();
3783 
3784 	if (r) {
3785 		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
3786 		atomic_inc(&hardware_enable_failed);
3787 		pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
3788 	}
3789 }
3790 
3791 static int kvm_starting_cpu(unsigned int cpu)
3792 {
3793 	raw_spin_lock(&kvm_count_lock);
3794 	if (kvm_usage_count)
3795 		hardware_enable_nolock(NULL);
3796 	raw_spin_unlock(&kvm_count_lock);
3797 	return 0;
3798 }
3799 
3800 static void hardware_disable_nolock(void *junk)
3801 {
3802 	int cpu = raw_smp_processor_id();
3803 
3804 	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
3805 		return;
3806 	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
3807 	kvm_arch_hardware_disable();
3808 }
3809 
3810 static int kvm_dying_cpu(unsigned int cpu)
3811 {
3812 	raw_spin_lock(&kvm_count_lock);
3813 	if (kvm_usage_count)
3814 		hardware_disable_nolock(NULL);
3815 	raw_spin_unlock(&kvm_count_lock);
3816 	return 0;
3817 }
3818 
3819 static void hardware_disable_all_nolock(void)
3820 {
3821 	BUG_ON(!kvm_usage_count);
3822 
3823 	kvm_usage_count--;
3824 	if (!kvm_usage_count)
3825 		on_each_cpu(hardware_disable_nolock, NULL, 1);
3826 }
3827 
3828 static void hardware_disable_all(void)
3829 {
3830 	raw_spin_lock(&kvm_count_lock);
3831 	hardware_disable_all_nolock();
3832 	raw_spin_unlock(&kvm_count_lock);
3833 }
3834 
3835 static int hardware_enable_all(void)
3836 {
3837 	int r = 0;
3838 
3839 	raw_spin_lock(&kvm_count_lock);
3840 
3841 	kvm_usage_count++;
3842 	if (kvm_usage_count == 1) {
3843 		atomic_set(&hardware_enable_failed, 0);
3844 		on_each_cpu(hardware_enable_nolock, NULL, 1);
3845 
3846 		if (atomic_read(&hardware_enable_failed)) {
3847 			hardware_disable_all_nolock();
3848 			r = -EBUSY;
3849 		}
3850 	}
3851 
3852 	raw_spin_unlock(&kvm_count_lock);
3853 
3854 	return r;
3855 }
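/*
 * kvm_usage_count tracks the number of live VMs: the first
 * hardware_enable_all() call enables virtualization on every online CPU (and,
 * via kvm_starting_cpu(), on CPUs that come online later), and the last
 * hardware_disable_all() turns it back off.  Both paths run under
 * kvm_count_lock.
 */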
3856 
3857 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3858 		      void *v)
3859 {
3860 	/*
3861 	 * Some (well, at least mine) BIOSes hang on reboot if
3862 	 * in vmx root mode.
3863 	 *
3864 	 * Intel TXT also requires VMX to be off on all CPUs when the system shuts down.
3865 	 */
3866 	pr_info("kvm: exiting hardware virtualization\n");
3867 	kvm_rebooting = true;
3868 	on_each_cpu(hardware_disable_nolock, NULL, 1);
3869 	return NOTIFY_OK;
3870 }
3871 
3872 static struct notifier_block kvm_reboot_notifier = {
3873 	.notifier_call = kvm_reboot,
3874 	.priority = 0,
3875 };
3876 
3877 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3878 {
3879 	int i;
3880 
3881 	for (i = 0; i < bus->dev_count; i++) {
3882 		struct kvm_io_device *pos = bus->range[i].dev;
3883 
3884 		kvm_iodevice_destructor(pos);
3885 	}
3886 	kfree(bus);
3887 }
3888 
3889 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
3890 				 const struct kvm_io_range *r2)
3891 {
3892 	gpa_t addr1 = r1->addr;
3893 	gpa_t addr2 = r2->addr;
3894 
3895 	if (addr1 < addr2)
3896 		return -1;
3897 
3898 	/* If r2->len == 0, match the exact address.  If r2->len != 0,
3899 	 * accept any overlapping write.  Any order is acceptable for
3900 	 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
3901 	 * we process all of them.
3902 	 */
3903 	if (r2->len) {
3904 		addr1 += r1->len;
3905 		addr2 += r2->len;
3906 	}
3907 
3908 	if (addr1 > addr2)
3909 		return 1;
3910 
3911 	return 0;
3912 }
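/*
 * Worked example (illustrative): for a device range registered at addr 0x100
 * with len 8, a key of { .addr = 0x104, .len = 4 } compares equal
 * (0x104 >= 0x100 and 0x104 + 4 <= 0x100 + 8), so the access is routed to
 * that device.  A device range registered with len 0 matches only keys whose
 * start address equals it exactly.
 */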
3913 
3914 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
3915 {
3916 	return kvm_io_bus_cmp(p1, p2);
3917 }
3918 
3919 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
3920 			     gpa_t addr, int len)
3921 {
3922 	struct kvm_io_range *range, key;
3923 	int off;
3924 
3925 	key = (struct kvm_io_range) {
3926 		.addr = addr,
3927 		.len = len,
3928 	};
3929 
3930 	range = bsearch(&key, bus->range, bus->dev_count,
3931 			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
3932 	if (range == NULL)
3933 		return -ENOENT;
3934 
3935 	off = range - bus->range;
3936 
3937 	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
3938 		off--;
3939 
3940 	return off;
3941 }
3942 
3943 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
3944 			      struct kvm_io_range *range, const void *val)
3945 {
3946 	int idx;
3947 
3948 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
3949 	if (idx < 0)
3950 		return -EOPNOTSUPP;
3951 
3952 	while (idx < bus->dev_count &&
3953 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
3954 		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
3955 					range->len, val))
3956 			return idx;
3957 		idx++;
3958 	}
3959 
3960 	return -EOPNOTSUPP;
3961 }
3962 
3963 /* kvm_io_bus_write - called under kvm->slots_lock */
3964 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
3965 		     int len, const void *val)
3966 {
3967 	struct kvm_io_bus *bus;
3968 	struct kvm_io_range range;
3969 	int r;
3970 
3971 	range = (struct kvm_io_range) {
3972 		.addr = addr,
3973 		.len = len,
3974 	};
3975 
3976 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
3977 	if (!bus)
3978 		return -ENOMEM;
3979 	r = __kvm_io_bus_write(vcpu, bus, &range, val);
3980 	return r < 0 ? r : 0;
3981 }
3982 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
3983 
3984 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
3985 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
3986 			    gpa_t addr, int len, const void *val, long cookie)
3987 {
3988 	struct kvm_io_bus *bus;
3989 	struct kvm_io_range range;
3990 
3991 	range = (struct kvm_io_range) {
3992 		.addr = addr,
3993 		.len = len,
3994 	};
3995 
3996 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
3997 	if (!bus)
3998 		return -ENOMEM;
3999 
4000 	/* First try the device referenced by cookie. */
4001 	if ((cookie >= 0) && (cookie < bus->dev_count) &&
4002 	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4003 		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4004 					val))
4005 			return cookie;
4006 
4007 	/*
4008 	 * cookie contained garbage; fall back to search and return the
4009 	 * correct cookie value.
4010 	 */
4011 	return __kvm_io_bus_write(vcpu, bus, &range, val);
4012 }
4013 
4014 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4015 			     struct kvm_io_range *range, void *val)
4016 {
4017 	int idx;
4018 
4019 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4020 	if (idx < 0)
4021 		return -EOPNOTSUPP;
4022 
4023 	while (idx < bus->dev_count &&
4024 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4025 		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4026 				       range->len, val))
4027 			return idx;
4028 		idx++;
4029 	}
4030 
4031 	return -EOPNOTSUPP;
4032 }
4033 
4034 /* kvm_io_bus_read - called under kvm->slots_lock */
4035 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4036 		    int len, void *val)
4037 {
4038 	struct kvm_io_bus *bus;
4039 	struct kvm_io_range range;
4040 	int r;
4041 
4042 	range = (struct kvm_io_range) {
4043 		.addr = addr,
4044 		.len = len,
4045 	};
4046 
4047 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4048 	if (!bus)
4049 		return -ENOMEM;
4050 	r = __kvm_io_bus_read(vcpu, bus, &range, val);
4051 	return r < 0 ? r : 0;
4052 }
4053 
4054 /* Caller must hold slots_lock. */
4055 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4056 			    int len, struct kvm_io_device *dev)
4057 {
4058 	int i;
4059 	struct kvm_io_bus *new_bus, *bus;
4060 	struct kvm_io_range range;
4061 
4062 	bus = kvm_get_bus(kvm, bus_idx);
4063 	if (!bus)
4064 		return -ENOMEM;
4065 
4066 	/* exclude ioeventfd which is limited by maximum fd */
4067 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
4068 		return -ENOSPC;
4069 
4070 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4071 			  GFP_KERNEL_ACCOUNT);
4072 	if (!new_bus)
4073 		return -ENOMEM;
4074 
4075 	range = (struct kvm_io_range) {
4076 		.addr = addr,
4077 		.len = len,
4078 		.dev = dev,
4079 	};
4080 
4081 	for (i = 0; i < bus->dev_count; i++)
4082 		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
4083 			break;
4084 
4085 	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4086 	new_bus->dev_count++;
4087 	new_bus->range[i] = range;
4088 	memcpy(new_bus->range + i + 1, bus->range + i,
4089 		(bus->dev_count - i) * sizeof(struct kvm_io_range));
4090 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4091 	synchronize_srcu_expedited(&kvm->srcu);
4092 	kfree(bus);
4093 
4094 	return 0;
4095 }
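/*
 * Registration keeps bus->range sorted by kvm_io_bus_cmp() so that
 * kvm_io_bus_get_first_dev() can bsearch it, and it follows a copy-and-publish
 * pattern: a larger copy of the bus is built, published with
 * rcu_assign_pointer(), and the old array is freed only after
 * synchronize_srcu_expedited() guarantees no reader can still see it.
 */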
4096 
4097 /* Caller must hold slots_lock. */
4098 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4099 			      struct kvm_io_device *dev)
4100 {
4101 	int i, j;
4102 	struct kvm_io_bus *new_bus, *bus;
4103 
4104 	bus = kvm_get_bus(kvm, bus_idx);
4105 	if (!bus)
4106 		return 0;
4107 
4108 	for (i = 0; i < bus->dev_count; i++)
4109 		if (bus->range[i].dev == dev) {
4110 			break;
4111 		}
4112 
4113 	if (i == bus->dev_count)
4114 		return 0;
4115 
4116 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4117 			  GFP_KERNEL_ACCOUNT);
4118 	if (new_bus) {
4119 		memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4120 		new_bus->dev_count--;
4121 		memcpy(new_bus->range + i, bus->range + i + 1,
4122 		       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
4123 	} else {
4124 		pr_err("kvm: failed to shrink bus, removing it completely\n");
4125 		for (j = 0; j < bus->dev_count; j++) {
4126 			if (j == i)
4127 				continue;
4128 			kvm_iodevice_destructor(bus->range[j].dev);
4129 		}
4130 	}
4131 
4132 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4133 	synchronize_srcu_expedited(&kvm->srcu);
4134 	kfree(bus);
4135 	return new_bus ? 0 : -ENOMEM;
4136 }
4137 
4138 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4139 					 gpa_t addr)
4140 {
4141 	struct kvm_io_bus *bus;
4142 	int dev_idx, srcu_idx;
4143 	struct kvm_io_device *iodev = NULL;
4144 
4145 	srcu_idx = srcu_read_lock(&kvm->srcu);
4146 
4147 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
4148 	if (!bus)
4149 		goto out_unlock;
4150 
4151 	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
4152 	if (dev_idx < 0)
4153 		goto out_unlock;
4154 
4155 	iodev = bus->range[dev_idx].dev;
4156 
4157 out_unlock:
4158 	srcu_read_unlock(&kvm->srcu, srcu_idx);
4159 
4160 	return iodev;
4161 }
4162 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
4163 
4164 static int kvm_debugfs_open(struct inode *inode, struct file *file,
4165 			   int (*get)(void *, u64 *), int (*set)(void *, u64),
4166 			   const char *fmt)
4167 {
4168 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4169 					  inode->i_private;
4170 
4171 	/* The debugfs files are a reference to the kvm struct which
4172 	 * is still valid when kvm_destroy_vm is called.
4173 	 * To avoid the race between open and the removal of the debugfs
4174 	 * directory we test against the users count.
4175 	 */
4176 	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
4177 		return -ENOENT;
4178 
4179 	if (simple_attr_open(inode, file, get,
4180 			     stat_data->mode & S_IWUGO ? set : NULL,
4181 			     fmt)) {
4182 		kvm_put_kvm(stat_data->kvm);
4183 		return -ENOMEM;
4184 	}
4185 
4186 	return 0;
4187 }
4188 
4189 static int kvm_debugfs_release(struct inode *inode, struct file *file)
4190 {
4191 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4192 					  inode->i_private;
4193 
4194 	simple_attr_release(inode, file);
4195 	kvm_put_kvm(stat_data->kvm);
4196 
4197 	return 0;
4198 }
4199 
4200 static int vm_stat_get_per_vm(void *data, u64 *val)
4201 {
4202 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4203 
4204 	*val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
4205 
4206 	return 0;
4207 }
4208 
4209 static int vm_stat_clear_per_vm(void *data, u64 val)
4210 {
4211 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4212 
4213 	if (val)
4214 		return -EINVAL;
4215 
4216 	*(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
4217 
4218 	return 0;
4219 }
4220 
4221 static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
4222 {
4223 	__simple_attr_check_format("%llu\n", 0ull);
4224 	return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
4225 				vm_stat_clear_per_vm, "%llu\n");
4226 }
4227 
4228 static const struct file_operations vm_stat_get_per_vm_fops = {
4229 	.owner   = THIS_MODULE,
4230 	.open    = vm_stat_get_per_vm_open,
4231 	.release = kvm_debugfs_release,
4232 	.read    = simple_attr_read,
4233 	.write   = simple_attr_write,
4234 	.llseek  = no_llseek,
4235 };
4236 
4237 static int vcpu_stat_get_per_vm(void *data, u64 *val)
4238 {
4239 	int i;
4240 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4241 	struct kvm_vcpu *vcpu;
4242 
4243 	*val = 0;
4244 
4245 	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4246 		*val += *(u64 *)((void *)vcpu + stat_data->offset);
4247 
4248 	return 0;
4249 }
4250 
4251 static int vcpu_stat_clear_per_vm(void *data, u64 val)
4252 {
4253 	int i;
4254 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4255 	struct kvm_vcpu *vcpu;
4256 
4257 	if (val)
4258 		return -EINVAL;
4259 
4260 	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4261 		*(u64 *)((void *)vcpu + stat_data->offset) = 0;
4262 
4263 	return 0;
4264 }
4265 
4266 static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
4267 {
4268 	__simple_attr_check_format("%llu\n", 0ull);
4269 	return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
4270 				 vcpu_stat_clear_per_vm, "%llu\n");
4271 }
4272 
4273 static const struct file_operations vcpu_stat_get_per_vm_fops = {
4274 	.owner   = THIS_MODULE,
4275 	.open    = vcpu_stat_get_per_vm_open,
4276 	.release = kvm_debugfs_release,
4277 	.read    = simple_attr_read,
4278 	.write   = simple_attr_write,
4279 	.llseek  = no_llseek,
4280 };
4281 
4282 static const struct file_operations *stat_fops_per_vm[] = {
4283 	[KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
4284 	[KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
4285 };
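/*
 * Per-VM debugfs stats read a single ulong at stat_data->offset inside
 * struct kvm; per-vcpu stats sum a u64 at the corresponding offset across
 * every vcpu of the VM.  Writing 0 to a writable stat file clears it; any
 * other value is rejected with -EINVAL (see the *_clear_per_vm() helpers
 * above).
 */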
4286 
4287 static int vm_stat_get(void *_offset, u64 *val)
4288 {
4289 	unsigned offset = (long)_offset;
4290 	struct kvm *kvm;
4291 	struct kvm_stat_data stat_tmp = {.offset = offset};
4292 	u64 tmp_val;
4293 
4294 	*val = 0;
4295 	mutex_lock(&kvm_lock);
4296 	list_for_each_entry(kvm, &vm_list, vm_list) {
4297 		stat_tmp.kvm = kvm;
4298 		vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4299 		*val += tmp_val;
4300 	}
4301 	mutex_unlock(&kvm_lock);
4302 	return 0;
4303 }
4304 
4305 static int vm_stat_clear(void *_offset, u64 val)
4306 {
4307 	unsigned offset = (long)_offset;
4308 	struct kvm *kvm;
4309 	struct kvm_stat_data stat_tmp = {.offset = offset};
4310 
4311 	if (val)
4312 		return -EINVAL;
4313 
4314 	mutex_lock(&kvm_lock);
4315 	list_for_each_entry(kvm, &vm_list, vm_list) {
4316 		stat_tmp.kvm = kvm;
4317 		vm_stat_clear_per_vm((void *)&stat_tmp, 0);
4318 	}
4319 	mutex_unlock(&kvm_lock);
4320 
4321 	return 0;
4322 }
4323 
4324 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
4325 
4326 static int vcpu_stat_get(void *_offset, u64 *val)
4327 {
4328 	unsigned offset = (long)_offset;
4329 	struct kvm *kvm;
4330 	struct kvm_stat_data stat_tmp = {.offset = offset};
4331 	u64 tmp_val;
4332 
4333 	*val = 0;
4334 	mutex_lock(&kvm_lock);
4335 	list_for_each_entry(kvm, &vm_list, vm_list) {
4336 		stat_tmp.kvm = kvm;
4337 		vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4338 		*val += tmp_val;
4339 	}
4340 	mutex_unlock(&kvm_lock);
4341 	return 0;
4342 }
4343 
4344 static int vcpu_stat_clear(void *_offset, u64 val)
4345 {
4346 	unsigned offset = (long)_offset;
4347 	struct kvm *kvm;
4348 	struct kvm_stat_data stat_tmp = {.offset = offset};
4349 
4350 	if (val)
4351 		return -EINVAL;
4352 
4353 	mutex_lock(&kvm_lock);
4354 	list_for_each_entry(kvm, &vm_list, vm_list) {
4355 		stat_tmp.kvm = kvm;
4356 		vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
4357 	}
4358 	mutex_unlock(&kvm_lock);
4359 
4360 	return 0;
4361 }
4362 
4363 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
4364 			"%llu\n");
4365 
4366 static const struct file_operations *stat_fops[] = {
4367 	[KVM_STAT_VCPU] = &vcpu_stat_fops,
4368 	[KVM_STAT_VM]   = &vm_stat_fops,
4369 };
4370 
4371 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
4372 {
4373 	struct kobj_uevent_env *env;
4374 	unsigned long long created, active;
4375 
4376 	if (!kvm_dev.this_device || !kvm)
4377 		return;
4378 
4379 	mutex_lock(&kvm_lock);
4380 	if (type == KVM_EVENT_CREATE_VM) {
4381 		kvm_createvm_count++;
4382 		kvm_active_vms++;
4383 	} else if (type == KVM_EVENT_DESTROY_VM) {
4384 		kvm_active_vms--;
4385 	}
4386 	created = kvm_createvm_count;
4387 	active = kvm_active_vms;
4388 	mutex_unlock(&kvm_lock);
4389 
4390 	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
4391 	if (!env)
4392 		return;
4393 
4394 	add_uevent_var(env, "CREATED=%llu", created);
4395 	add_uevent_var(env, "COUNT=%llu", active);
4396 
4397 	if (type == KVM_EVENT_CREATE_VM) {
4398 		add_uevent_var(env, "EVENT=create");
4399 		kvm->userspace_pid = task_pid_nr(current);
4400 	} else if (type == KVM_EVENT_DESTROY_VM) {
4401 		add_uevent_var(env, "EVENT=destroy");
4402 	}
4403 	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
4404 
4405 	if (kvm->debugfs_dentry) {
4406 		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
4407 
4408 		if (p) {
4409 			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
4410 			if (!IS_ERR(tmp))
4411 				add_uevent_var(env, "STATS_PATH=%s", tmp);
4412 			kfree(p);
4413 		}
4414 	}
4415 	/* no need for checks, since we add at most 5 keys */
4416 	env->envp[env->envp_idx++] = NULL;
4417 	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
4418 	kfree(env);
4419 }
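/*
 * The resulting uevent is a KOBJ_CHANGE on the /dev/kvm misc device carrying
 * up to five keys: CREATED, COUNT, EVENT (create/destroy), PID and, when the
 * per-VM debugfs directory exists, STATS_PATH.
 */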
4420 
4421 static void kvm_init_debug(void)
4422 {
4423 	struct kvm_stats_debugfs_item *p;
4424 
4425 	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4426 
4427 	kvm_debugfs_num_entries = 0;
4428 	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4429 		int mode = p->mode ? p->mode : 0644;
4430 		debugfs_create_file(p->name, mode, kvm_debugfs_dir,
4431 				    (void *)(long)p->offset,
4432 				    stat_fops[p->kind]);
4433 	}
4434 }
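/*
 * The files created here appear under the "kvm" directory of debugfs
 * (typically mounted at /sys/kernel/debug), one file per debugfs_entries[]
 * item, defaulting to mode 0644 when the entry does not specify one.
 */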
4435 
4436 static int kvm_suspend(void)
4437 {
4438 	if (kvm_usage_count)
4439 		hardware_disable_nolock(NULL);
4440 	return 0;
4441 }
4442 
4443 static void kvm_resume(void)
4444 {
4445 	if (kvm_usage_count) {
4446 #ifdef CONFIG_LOCKDEP
4447 		WARN_ON(lockdep_is_held(&kvm_count_lock));
4448 #endif
4449 		hardware_enable_nolock(NULL);
4450 	}
4451 }
4452 
4453 static struct syscore_ops kvm_syscore_ops = {
4454 	.suspend = kvm_suspend,
4455 	.resume = kvm_resume,
4456 };
4457 
4458 static inline
4459 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
4460 {
4461 	return container_of(pn, struct kvm_vcpu, preempt_notifier);
4462 }
4463 
4464 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
4465 {
4466 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
4467 
4468 	WRITE_ONCE(vcpu->preempted, false);
4469 	WRITE_ONCE(vcpu->ready, false);
4470 
4471 	kvm_arch_sched_in(vcpu, cpu);
4472 
4473 	kvm_arch_vcpu_load(vcpu, cpu);
4474 }
4475 
4476 static void kvm_sched_out(struct preempt_notifier *pn,
4477 			  struct task_struct *next)
4478 {
4479 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
4480 
4481 	if (current->state == TASK_RUNNING) {
4482 		WRITE_ONCE(vcpu->preempted, true);
4483 		WRITE_ONCE(vcpu->ready, true);
4484 	}
4485 	kvm_arch_vcpu_put(vcpu);
4486 }
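/*
 * These preempt notifier hooks keep guest state coherent across host context
 * switches: kvm_arch_vcpu_load()/kvm_arch_vcpu_put() move the vcpu's hardware
 * state onto and off of the physical CPU, while vcpu->preempted/ready record
 * that the task was scheduled out while still runnable (consumed elsewhere,
 * e.g. by directed yield).
 */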
4487 
4488 static void check_processor_compat(void *rtn)
4489 {
4490 	*(int *)rtn = kvm_arch_check_processor_compat();
4491 }
4492 
4493 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
4494 		  struct module *module)
4495 {
4496 	int r;
4497 	int cpu;
4498 
4499 	r = kvm_arch_init(opaque);
4500 	if (r)
4501 		goto out_fail;
4502 
4503 	/*
4504 	 * kvm_arch_init makes sure there's at most one caller
4505 	 * for architectures that support multiple implementations,
4506 	 * like Intel and AMD on x86.
4507 	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
4508 	 * conflicts in case kvm is already setup for another implementation.
4509 	 */
4510 	r = kvm_irqfd_init();
4511 	if (r)
4512 		goto out_irqfd;
4513 
4514 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
4515 		r = -ENOMEM;
4516 		goto out_free_0;
4517 	}
4518 
4519 	r = kvm_arch_hardware_setup();
4520 	if (r < 0)
4521 		goto out_free_0a;
4522 
4523 	for_each_online_cpu(cpu) {
4524 		smp_call_function_single(cpu, check_processor_compat, &r, 1);
4525 		if (r < 0)
4526 			goto out_free_1;
4527 	}
4528 
4529 	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
4530 				      kvm_starting_cpu, kvm_dying_cpu);
4531 	if (r)
4532 		goto out_free_2;
4533 	register_reboot_notifier(&kvm_reboot_notifier);
4534 
4535 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
4536 	if (!vcpu_align)
4537 		vcpu_align = __alignof__(struct kvm_vcpu);
4538 	kvm_vcpu_cache =
4539 		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
4540 					   SLAB_ACCOUNT,
4541 					   offsetof(struct kvm_vcpu, arch),
4542 					   sizeof_field(struct kvm_vcpu, arch),
4543 					   NULL);
4544 	if (!kvm_vcpu_cache) {
4545 		r = -ENOMEM;
4546 		goto out_free_3;
4547 	}
4548 
4549 	r = kvm_async_pf_init();
4550 	if (r)
4551 		goto out_free;
4552 
4553 	kvm_chardev_ops.owner = module;
4554 	kvm_vm_fops.owner = module;
4555 	kvm_vcpu_fops.owner = module;
4556 
4557 	r = misc_register(&kvm_dev);
4558 	if (r) {
4559 		pr_err("kvm: misc device register failed\n");
4560 		goto out_unreg;
4561 	}
4562 
4563 	register_syscore_ops(&kvm_syscore_ops);
4564 
4565 	kvm_preempt_ops.sched_in = kvm_sched_in;
4566 	kvm_preempt_ops.sched_out = kvm_sched_out;
4567 
4568 	kvm_init_debug();
4569 
4570 	r = kvm_vfio_ops_init();
4571 	WARN_ON(r);
4572 
4573 	return 0;
4574 
4575 out_unreg:
4576 	kvm_async_pf_deinit();
4577 out_free:
4578 	kmem_cache_destroy(kvm_vcpu_cache);
4579 out_free_3:
4580 	unregister_reboot_notifier(&kvm_reboot_notifier);
4581 	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
4582 out_free_2:
4583 out_free_1:
4584 	kvm_arch_hardware_unsetup();
4585 out_free_0a:
4586 	free_cpumask_var(cpus_hardware_enabled);
4587 out_free_0:
4588 	kvm_irqfd_exit();
4589 out_irqfd:
4590 	kvm_arch_exit();
4591 out_fail:
4592 	return r;
4593 }
4594 EXPORT_SYMBOL_GPL(kvm_init);
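/*
 * Illustrative sketch (names hypothetical): an architecture module calls this
 * from its module_init(), passing its own vcpu container type, e.g.:
 *
 *	return kvm_init(&my_arch_ops, sizeof(struct my_arch_vcpu),
 *			__alignof__(struct my_arch_vcpu), THIS_MODULE);
 */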
4595 
4596 void kvm_exit(void)
4597 {
4598 	debugfs_remove_recursive(kvm_debugfs_dir);
4599 	misc_deregister(&kvm_dev);
4600 	kmem_cache_destroy(kvm_vcpu_cache);
4601 	kvm_async_pf_deinit();
4602 	unregister_syscore_ops(&kvm_syscore_ops);
4603 	unregister_reboot_notifier(&kvm_reboot_notifier);
4604 	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
4605 	on_each_cpu(hardware_disable_nolock, NULL, 1);
4606 	kvm_arch_hardware_unsetup();
4607 	kvm_arch_exit();
4608 	kvm_irqfd_exit();
4609 	free_cpumask_var(cpus_hardware_enabled);
4610 	kvm_vfio_ops_exit();
4611 }
4612 EXPORT_SYMBOL_GPL(kvm_exit);
4613 
4614 struct kvm_vm_worker_thread_context {
4615 	struct kvm *kvm;
4616 	struct task_struct *parent;
4617 	struct completion init_done;
4618 	kvm_vm_thread_fn_t thread_fn;
4619 	uintptr_t data;
4620 	int err;
4621 };
4622 
4623 static int kvm_vm_worker_thread(void *context)
4624 {
4625 	/*
4626 	 * The init_context is allocated on the stack of the parent thread, so
4627 	 * we have to locally copy anything that is needed beyond initialization.
4628 	 */
4629 	struct kvm_vm_worker_thread_context *init_context = context;
4630 	struct kvm *kvm = init_context->kvm;
4631 	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
4632 	uintptr_t data = init_context->data;
4633 	int err;
4634 
4635 	err = kthread_park(current);
4636 	/* kthread_park(current) is never supposed to return an error */
4637 	WARN_ON(err != 0);
4638 	if (err)
4639 		goto init_complete;
4640 
4641 	err = cgroup_attach_task_all(init_context->parent, current);
4642 	if (err) {
4643 		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
4644 			__func__, err);
4645 		goto init_complete;
4646 	}
4647 
4648 	set_user_nice(current, task_nice(init_context->parent));
4649 
4650 init_complete:
4651 	init_context->err = err;
4652 	complete(&init_context->init_done);
4653 	init_context = NULL;
4654 
4655 	if (err)
4656 		return err;
4657 
4658 	/* Wait to be woken up by the spawner before proceeding. */
4659 	kthread_parkme();
4660 
4661 	if (!kthread_should_stop())
4662 		err = thread_fn(kvm, data);
4663 
4664 	return err;
4665 }
4666 
4667 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
4668 				uintptr_t data, const char *name,
4669 				struct task_struct **thread_ptr)
4670 {
4671 	struct kvm_vm_worker_thread_context init_context = {};
4672 	struct task_struct *thread;
4673 
4674 	*thread_ptr = NULL;
4675 	init_context.kvm = kvm;
4676 	init_context.parent = current;
4677 	init_context.thread_fn = thread_fn;
4678 	init_context.data = data;
4679 	init_completion(&init_context.init_done);
4680 
4681 	thread = kthread_run(kvm_vm_worker_thread, &init_context,
4682 			     "%s-%d", name, task_pid_nr(current));
4683 	if (IS_ERR(thread))
4684 		return PTR_ERR(thread);
4685 
4686 	/* kthread_run is never supposed to return NULL */
4687 	WARN_ON(thread == NULL);
4688 
4689 	wait_for_completion(&init_context.init_done);
4690 
4691 	if (!init_context.err)
4692 		*thread_ptr = thread;
4693 
4694 	return init_context.err;
4695 }
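/*
 * The worker created above parks itself after initialization; the caller is
 * expected to wake it when work is available (e.g. with kthread_unpark()) and
 * to stop it with kthread_stop() when the VM goes away.  A minimal sketch,
 * assuming a hypothetical handler my_thread_fn:
 *
 *	struct task_struct *t;
 *	if (!kvm_vm_create_worker_thread(kvm, my_thread_fn, 0, "my-worker", &t))
 *		kthread_unpark(t);
 */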
4696